From 6c2134a860438c4c681271a69c7486f52fc9dae8 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 25 Sep 2023 22:10:47 +0800 Subject: [PATCH] improved makefile, allowing building without k quants --- Makefile | 55 +++++++++++++++++++++++++++++++++++++++++++++------- koboldcpp.py | 12 +++++------- 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 440ff6112..740667166 100644 --- a/Makefile +++ b/Makefile @@ -39,10 +39,15 @@ endif # # keep standard at C11 and C++11 -CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE -CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE +CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE +CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE LDFLAGS = +ifndef LLAMA_NO_K_QUANTS +CFLAGS += -DGGML_USE_K_QUANTS +CXXFLAGS += -DGGML_USE_K_QUANTS +endif + # these are used on windows, to build some libraries with extra old device compatibility SIMPLECFLAGS = FULLCFLAGS = @@ -285,19 +290,17 @@ ifeq ($(OS),Windows_NT) endif else DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) - FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) + ifdef LLAMA_OPENBLAS OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) - NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) endif ifdef LLAMA_CLBLAST ifeq ($(UNAME_S),Darwin) - CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) else - CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) endif endif - ifdef LLAMA_CUBLAS CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS) endif @@ -351,12 +354,18 @@ ggml_cublas.o: ggml.c ggml.h ggml-cuda.h k_quants.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ #quants K +ifndef LLAMA_NO_K_QUANTS k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ k_quants_noavx2.o: k_quants.c k_quants.h ggml.h ggml-cuda.h $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ +else +k_quants.o: +k_quants_noavx2.o: +k_quants_failsafe.o: +endif # LLAMA_NO_K_QUANTS #there's no intrinsics or special gpu ops used here, so we can have a universal object ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h @@ -425,22 +434,54 @@ main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + #generated libraries koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) $(DEFAULT_BUILD) + +ifdef OPENBLAS_BUILD koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) $(OPENBLAS_BUILD) +else +koboldcpp_openblas: +endif + +ifdef FAILSAFE_BUILD koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o grammar-parser.o $(OBJS) $(FAILSAFE_BUILD) +else +koboldcpp_failsafe: +endif + +ifdef NOAVX2_BUILD koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o grammar-parser.o $(OBJS) $(NOAVX2_BUILD) +else +koboldcpp_noavx2: +endif + +ifdef CLBLAST_BUILD koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) $(CLBLAST_BUILD) +else +koboldcpp_clblast: +endif + +ifdef CUBLAS_BUILD koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS) $(CUBLAS_BUILD) +else +koboldcpp_cublas: +endif + +ifdef HIPBLAS_BUILD koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS) $(HIPBLAS_BUILD) +else +koboldcpp_hipblas: +endif +# tools quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) quantize_gptj: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp diff --git a/koboldcpp.py b/koboldcpp.py index 154c43dbb..531e89f43 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -576,9 +576,6 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): global modelbusy, requestsinqueue content_length = int(self.headers['Content-Length']) body = self.rfile.read(content_length) - basic_api_flag = False - kai_api_flag = False - kai_sse_stream_flag = False self.path = self.path.rstrip('/') if self.path.endswith(('/api/extra/tokencount')): @@ -634,6 +631,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): requestsinqueue = (requestsinqueue - 1) if requestsinqueue>0 else 0 try: + basic_api_flag = False + kai_api_flag = False + kai_sse_stream_flag = False if self.path.endswith('/request'): basic_api_flag = True @@ -1670,7 +1670,7 @@ def run_horde_worker(args, api_key, worker_name): time.sleep(3) else: print_with_time("Horde Worker Shutdown - Server Closing.") - time.sleep(2) + time.sleep(3) sys.exit(2) def unload_libs(): @@ -1752,7 +1752,7 @@ def main(launch_args,start_server=True): setattr(args, key, value) else: print("Specified kcpp config file invalid or not found.") - time.sleep(2) + time.sleep(3) sys.exit(2) if not args.model_param: args.model_param = args.model @@ -1896,8 +1896,6 @@ def main(launch_args,start_server=True): asyncio.run(RunServerMultiThreaded(args.host, args.port, embedded_kailite)) else: print(f"Server was not started, main function complete. Idling.") - # while True: - # time.sleep(5) if __name__ == '__main__': print("***\nWelcome to KoboldCpp - Version " + KcppVersion) # just update version manually