Improved Makefile: allow building without k-quants by defining LLAMA_NO_K_QUANTS

This commit is contained in:
Concedo 2023-09-25 22:10:47 +08:00
parent 17ee719c56
commit 6c2134a860
2 changed files with 53 additions and 14 deletions

View file

@ -39,10 +39,15 @@ endif
# #
# keep standard at C11 and C++11 # keep standard at C11 and C++11
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
LDFLAGS = LDFLAGS =
ifndef LLAMA_NO_K_QUANTS
CFLAGS += -DGGML_USE_K_QUANTS
CXXFLAGS += -DGGML_USE_K_QUANTS
endif
# these are used on windows, to build some libraries with extra old device compatibility # these are used on windows, to build some libraries with extra old device compatibility
SIMPLECFLAGS = SIMPLECFLAGS =
FULLCFLAGS = FULLCFLAGS =
@ -285,19 +290,17 @@ ifeq ($(OS),Windows_NT)
endif endif
else else
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
ifdef LLAMA_OPENBLAS ifdef LLAMA_OPENBLAS
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
endif endif
ifdef LLAMA_CLBLAST ifdef LLAMA_CLBLAST
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
else else
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
endif endif
endif endif
ifdef LLAMA_CUBLAS ifdef LLAMA_CUBLAS
CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS) CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS)
endif endif
@ -351,12 +354,18 @@ ggml_cublas.o: ggml.c ggml.h ggml-cuda.h k_quants.h
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
#quants K #quants K
ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
k_quants_noavx2.o: k_quants.c k_quants.h ggml.h ggml-cuda.h k_quants_noavx2.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
else
k_quants.o:
k_quants_noavx2.o:
k_quants_failsafe.o:
endif # LLAMA_NO_K_QUANTS
#there's no intrinsics or special gpu ops used here, so we can have a universal object #there's no intrinsics or special gpu ops used here, so we can have a universal object
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
@ -425,22 +434,54 @@ main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS) gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
#generated libraries #generated libraries
koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
$(DEFAULT_BUILD) $(DEFAULT_BUILD)
ifdef OPENBLAS_BUILD
koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
$(OPENBLAS_BUILD) $(OPENBLAS_BUILD)
else
koboldcpp_openblas:
endif
ifdef FAILSAFE_BUILD
koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o grammar-parser.o $(OBJS) koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o grammar-parser.o $(OBJS)
$(FAILSAFE_BUILD) $(FAILSAFE_BUILD)
else
koboldcpp_failsafe:
endif
ifdef NOAVX2_BUILD
koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o grammar-parser.o $(OBJS) koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o grammar-parser.o $(OBJS)
$(NOAVX2_BUILD) $(NOAVX2_BUILD)
else
koboldcpp_noavx2:
endif
ifdef CLBLAST_BUILD
koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
$(CLBLAST_BUILD) $(CLBLAST_BUILD)
else
koboldcpp_clblast:
endif
ifdef CUBLAS_BUILD
koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS) koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
$(CUBLAS_BUILD) $(CUBLAS_BUILD)
else
koboldcpp_cublas:
endif
ifdef HIPBLAS_BUILD
koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS) koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
$(HIPBLAS_BUILD) $(HIPBLAS_BUILD)
else
koboldcpp_hipblas:
endif
# tools
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_gptj: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp quantize_gptj: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp

View file

@ -576,9 +576,6 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
global modelbusy, requestsinqueue global modelbusy, requestsinqueue
content_length = int(self.headers['Content-Length']) content_length = int(self.headers['Content-Length'])
body = self.rfile.read(content_length) body = self.rfile.read(content_length)
basic_api_flag = False
kai_api_flag = False
kai_sse_stream_flag = False
self.path = self.path.rstrip('/') self.path = self.path.rstrip('/')
if self.path.endswith(('/api/extra/tokencount')): if self.path.endswith(('/api/extra/tokencount')):
@ -634,6 +631,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
requestsinqueue = (requestsinqueue - 1) if requestsinqueue>0 else 0 requestsinqueue = (requestsinqueue - 1) if requestsinqueue>0 else 0
try: try:
basic_api_flag = False
kai_api_flag = False
kai_sse_stream_flag = False
if self.path.endswith('/request'): if self.path.endswith('/request'):
basic_api_flag = True basic_api_flag = True
@ -1670,7 +1670,7 @@ def run_horde_worker(args, api_key, worker_name):
time.sleep(3) time.sleep(3)
else: else:
print_with_time("Horde Worker Shutdown - Server Closing.") print_with_time("Horde Worker Shutdown - Server Closing.")
time.sleep(2) time.sleep(3)
sys.exit(2) sys.exit(2)
def unload_libs(): def unload_libs():
@ -1752,7 +1752,7 @@ def main(launch_args,start_server=True):
setattr(args, key, value) setattr(args, key, value)
else: else:
print("Specified kcpp config file invalid or not found.") print("Specified kcpp config file invalid or not found.")
time.sleep(2) time.sleep(3)
sys.exit(2) sys.exit(2)
if not args.model_param: if not args.model_param:
args.model_param = args.model args.model_param = args.model
@ -1896,8 +1896,6 @@ def main(launch_args,start_server=True):
asyncio.run(RunServerMultiThreaded(args.host, args.port, embedded_kailite)) asyncio.run(RunServerMultiThreaded(args.host, args.port, embedded_kailite))
else: else:
print(f"Server was not started, main function complete. Idling.") print(f"Server was not started, main function complete. Idling.")
# while True:
# time.sleep(5)
if __name__ == '__main__': if __name__ == '__main__':
print("***\nWelcome to KoboldCpp - Version " + KcppVersion) # just update version manually print("***\nWelcome to KoboldCpp - Version " + KcppVersion) # just update version manually