Merged upstream, fixed OSX compile errors, integrated noavx2 build into main
commit 4faae0afa9

15 changed files with 135 additions and 75 deletions
.ecrc (new file, +5)

@@ -0,0 +1,5 @@
+{
+    "Disable": {
+        "IndentSize": true
+    }
+}
.editorconfig (new file, +16)

@@ -0,0 +1,16 @@
+# https://EditorConfig.org
+
+# Top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file, utf-8 charset
+[*]
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+
+[Makefile]
+indent_style = tab
.github/ISSUE_TEMPLATE/custom.md (vendored, 16 changes)

@@ -22,9 +22,9 @@ Please provide a detailed written description of what you were trying to do, and
 
 # Current Behavior
 
 Please provide a detailed written description of what `llama.cpp` did, instead.
 
 # Environment and Context
 
 Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
 
@@ -133,7 +133,7 @@ llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.
 llama_model_load: .......................................................................................... done
 llama_model_load: model size = 4869.09 MB / num tensors = 723
 
 system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
 
 main: prompt: 'Please close your issue when it has been answered.'
 main: number of tokens in prompt = 11
@@ -166,14 +166,14 @@ main: total time = 246406.42 ms
 
 Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
 
 3636882.89 msec task-clock # 14.677 CPUs utilized
 13509 context-switches # 3.714 /sec
 2436 cpu-migrations # 0.670 /sec
 10476679 page-faults # 2.881 K/sec
 13133115082869 cycles # 3.611 GHz (16.77%)
 29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%)
 10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%)
 23479217109614 instructions # 1.79 insn per cycle
 # 0.44 stalled cycles per insn (16.76%)
 2353072268027 branches # 647.002 M/sec (16.77%)
 1998682780 branch-misses # 0.08% of all branches (16.76%)
.github/workflows/editorconfig.yml (vendored, new file, +17)

@@ -0,0 +1,17 @@
+name: EditorConfig Checker
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  editorconfig:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: editorconfig-checker/action-editorconfig-checker@main
+      - run: editorconfig-checker
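To reproduce this check locally, a minimal sketch (it assumes the editorconfig-checker binary is already installed and on PATH, which is not part of this commit):

    editorconfig-checker

Run from the repository root, it picks up the new .editorconfig rules and the .ecrc file above, which tells the checker to skip its IndentSize check.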
Makefile (37 changes)

@@ -34,6 +34,7 @@ endif
 CFLAGS = -I. -Ofast -DNDEBUG -std=c11 -fPIC
 CXXFLAGS = -I. -I./examples -Ofast -DNDEBUG -std=c++11 -fPIC
 LDFLAGS =
+BONUSCFLAGS =
 
 #lets try enabling everything
 CFLAGS += -pthread -s
@@ -71,7 +72,8 @@ endif
 # feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 # Use all CPU extensions that are available:
-CFLAGS += -mf16c -mfma -mavx2 -mavx -msse3
+CFLAGS += -mf16c -mavx -msse3
+BONUSCFLAGS += -mfma -mavx2
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
@@ -122,17 +124,19 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
 endif
 
 OPENBLAS_BUILD =
+CLBLAST_BUILD =
+OPENBLAS_NOAVX2_BUILD =
 
 ifeq ($(OS),Windows_NT)
 OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas.dll $(LDFLAGS)
-else
-OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. If you want to use openblas, please install it seperately, then link it manually with LLAMA_OPENBLAS=1. This is just a reminder, not an error.'
-endif
-
-CLBLAST_BUILD =
-ifeq ($(OS),Windows_NT)
 CLBLAST_BUILD = $(CXX) $(CXXFLAGS) ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/OpenCL.lib lib/clblast.lib -shared -o koboldcpp_clblast.dll $(LDFLAGS)
+OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas_noavx2.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas_noavx2.dll $(LDFLAGS)
 else
-CLBLAST_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. If you want to use CLBlast, please install it seperately, then link it manually with LLAMA_CLBLAST=1. This is just a reminder, not an error.'
+ifndef LLAMA_OPENBLAS
+ifndef LLAMA_CLBLAST
+OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
+endif
+endif
 endif
 
 #
@@ -150,22 +154,28 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
-default: llamalib llamalib_openblas llamalib_clblast
+default: llamalib llamalib_openblas llamalib_openblas_noavx2 llamalib_clblast
 
 #
 # Build library
 #
 
 ggml.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) -c ggml.c -o ggml.o
+	$(CC) $(CFLAGS) $(BONUSCFLAGS) -c ggml.c -o ggml.o
 
 ggml_openblas.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas.o
+	$(CC) $(CFLAGS) $(BONUSCFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas.o
+
+ggml_openblas_noavx2.o: ggml.c ggml.h
+	$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas_noavx2.o
 
 ggml_clblast.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -DGGML_USE_CLBLAST -c ggml.c -o ggml_clblast.o
+	$(CC) $(CFLAGS) $(BONUSCFLAGS) -DGGML_USE_OPENBLAS -DGGML_USE_CLBLAST -c ggml.c -o ggml_clblast.o
 
 ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
+	$(CC) $(CFLAGS) $(BONUSCFLAGS) -c otherarch/ggml_v1.c -o ggml_v1.o
+
+ggml_v1_noavx2.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
 	$(CC) $(CFLAGS) -c otherarch/ggml_v1.c -o ggml_v1.o
 
 llama.o: llama.cpp llama.h llama_internal.h
@@ -198,6 +208,9 @@ llamalib: ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 llamalib_openblas: ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(OPENBLAS_BUILD)
 
+llamalib_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
+	$(OPENBLAS_NOAVX2_BUILD)
+
 llamalib_clblast: ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(CLBLAST_BUILD)
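With these targets in place, the no-AVX2 OpenBLAS library is built like the other variants. A usage sketch (it assumes a Windows toolchain, since OPENBLAS_NOAVX2_BUILD only links the DLL when $(OS) is Windows_NT and expects the bundled lib/libopenblas.lib):

    make llamalib_openblas_noavx2

The resulting koboldcpp_openblas_noavx2.dll is compiled without $(BONUSCFLAGS), i.e. without -mfma and -mavx2, which is the point of the compatibility build. A plain `make` now builds it as well, since the target was added to the default goal.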
@@ -19,15 +19,15 @@ GEN_OPTIONS=(--batch_size 1024
 --top_p 0.5)
 
 if [ -n "$N_THREAD" ]; then
     GEN_OPTIONS+=(--threads "$N_THREAD")
 fi
 
 ./main "${GEN_OPTIONS[@]}" \
     --model "$MODEL" \
     --n_predict "$N_PREDICTS" \
     --color --interactive \
     --reverse-prompt "${USER_NAME}:" \
     --prompt "
 This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
@@ -22,9 +22,9 @@ extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHand
 extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
 extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
     const wchar_t * lpWideCharStr, int cchWideChar,
     char * lpMultiByteStr, int cbMultiByte,
     const char * lpDefaultChar, bool * lpUsedDefaultChar);
 #define CP_UTF8 65001
 #endif
@@ -328,9 +328,9 @@ void win32_console_init(bool enable_color) {
 
 // Convert a wide Unicode string to an UTF8 string
 void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
     int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
     std::string strTo(size_needed, 0);
     WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
     str = strTo;
 }
 #endif
@@ -1,3 +1,3 @@
 # embedding
 
 TODO
@@ -1,3 +1,3 @@
 # main
 
 TODO
@@ -168,7 +168,7 @@ int main(int argc, char ** argv) {
     }
 
     // enable interactive mode if reverse prompt or interactive start is specified
     if (params.antiprompt.size() != 0 || params.interactive_start) {
         params.interactive = true;
     }
 
@@ -1,3 +1,3 @@
 # perplexity
 
 TODO
ggml.c (18 changes)

@@ -127,9 +127,9 @@ typedef void* thread_ret_t;
 
 #ifdef GGML_USE_ACCELERATE
 #include <Accelerate/Accelerate.h>
-#elif GGML_USE_OPENBLAS
-#include <ggml_blas_adapter.c>
 #endif
+#include <ggml_blas_adapter.c>
+
 
 #undef MIN
 #undef MAX
@@ -228,12 +228,12 @@ static inline float fp32_from_bits(uint32_t w) {
 }
 
 static inline uint32_t fp32_to_bits(float f) {
     union {
         float as_value;
         uint32_t as_bits;
     } fp32;
     fp32.as_value = f;
     return fp32.as_bits;
 }
 
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
@@ -1881,7 +1881,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
 #endif
 #else
         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
 
         const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
@@ -4,6 +4,7 @@
 //windows binaries for clblast obtained from https://github.com/CNugteren/CLBlast (apache license)
 //windows binaries for opencl obtained from https://github.com/KhronosGroup/OpenCL-SDK (apache license)
 
+#if GGML_USE_OPENBLAS
 #include <cblas.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -104,21 +105,16 @@ static void ggml_cl_sgemm_wrapper(const enum CBLAS_ORDER order, const enum CBLAS
 }
 
 #endif
 
-static void do_blas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
-OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc)
-{
-#if GGML_USE_CLBLAST
-ggml_cl_sgemm_wrapper(Order, TransA, TransB,
-M, N, K,
-alpha, A, lda,
-B, ldb,
-beta, C, ldc);
-#else
-cblas_sgemm(Order, TransA, TransB,
-M, N, K,
-alpha, A, lda,
-B, ldb,
-beta, C, ldc);
 #endif
-}
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if GGML_USE_CLBLAST
+#define do_blas_sgemm(Order, TransA, TransB,M, N, K,alpha, A, lda, B, ldb, beta, C, ldc) ({\
+ggml_cl_sgemm_wrapper(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);\
+})
+#else
+#define do_blas_sgemm(Order, TransA, TransB,M, N, K,alpha, A, lda, B, ldb, beta, C, ldc) ({\
+cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);\
+})
+#endif
+#endif
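Since ggml_blas_adapter.c is now included unconditionally from ggml.c, with cblas.h pulled in only under GGML_USE_OPENBLAS, an Accelerate-only build should compile again on OSX. A rough sketch of that compile step, assuming flags along the lines of the Makefile's CFLAGS (the exact darwin flags are not part of this hunk):

    cc -I. -Ofast -DNDEBUG -std=c11 -fPIC -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o

In that configuration cblas_sgemm is resolved through Accelerate's headers rather than OpenBLAS, which is what the new #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) guard around do_blas_sgemm allows.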
koboldcpp.py (23 changes)

@@ -36,11 +36,14 @@ class generation_outputs(ctypes.Structure):
 handle = None
 use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
 use_clblast = False #uses CLBlast instead
+use_noavx2 = False #uses openblas with no avx2 instructions
 
 def init_library():
-    global handle, use_blas, use_clblast
+    global handle, use_blas, use_clblast, use_noavx2
     libname = ""
-    if use_blas:
+    if use_noavx2:
+        libname = "koboldcpp_openblas_noavx2.dll"
+    elif use_blas:
         libname = "koboldcpp_openblas.dll"
     elif use_clblast:
         libname = "koboldcpp_clblast.dll"
@@ -309,7 +312,7 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):
         sys.exit(0)
 
 def main(args):
-    global use_blas, use_clblast
+    global use_blas, use_clblast, use_noavx2
     if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas.dll")):
         print("Warning: libopenblas.dll or koboldcpp_openblas.dll not found. Non-BLAS library will be used. Ignore this if you have manually linked with OpenBLAS.")
         use_blas = False
@@ -322,6 +325,14 @@ def main(args):
     else:
         print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast.dll will be required.")
         use_clblast = True
+    elif args.noavx2:
+        if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas_noavx2.dll")):
+            print("Warning: libopenblas.dll or koboldcpp_openblas_noavx2.dll not found. This mode cannot be used.")
+        elif os.name == 'nt':
+            print("Attempting to use non-avx2 compatibility openblas library.")
+            use_noavx2 = True
+        else:
+            print("Non-AVX2 compatibility OpenBLAS mode only available on windows. On other OS, please manually rebuild without AVX2 flags.")
     elif not args.noblas:
         print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas.dll will be required.")
         use_blas = True
@@ -409,8 +420,10 @@ if __name__ == '__main__':
     parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
    parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
-    parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
     parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
-    parser.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
+    compatgroup = parser.add_mutually_exclusive_group()
+    compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
+    compatgroup.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --noblas or --clblast.", action='store_true')
+    compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     args = parser.parse_args()
     main(args)
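A usage sketch for the new flag (the model path here is only illustrative, not something defined in this commit):

    python koboldcpp.py --noavx2 model.bin

On Windows this makes init_library() load koboldcpp_openblas_noavx2.dll; on other systems it prints the compatibility warning and falls back. Because --noblas, --noavx2 and --useclblast now live in a mutually exclusive argparse group, passing two of them together is rejected at parse time.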
@@ -1 +1 @@
-pyinstaller --noconfirm --onefile --clean --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." "./koboldcpp.py" -n "koboldcpp.exe"
+pyinstaller --noconfirm --onefile --clean --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." "./koboldcpp.py" -n "koboldcpp.exe"