diff --git a/Makefile b/Makefile
index 278ab6361..9b616294d 100644
--- a/Makefile
+++ b/Makefile
@@ -72,8 +72,13 @@ endif
 # feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	# Use all CPU extensions that are available:
-	CFLAGS += -mf16c -mavx -msse3
-	BONUSCFLAGS += -mfma -mavx2
+	CFLAGS += -mavx
+	ifeq ($(OS),Windows_NT)
+		BONUSCFLAGS += -mfma -mavx2 -mf16c -msse3
+	else
+# if not on windows, they are clearly building it themselves, so lets just use whatever is supported
+		CFLAGS += -march=native -mtune=native
+	endif
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index e04f04c40..667f8b1d5 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -224,7 +224,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     int original_threads = params.n_threads;
     if (blasmode)
     {
-        params.n_batch = 1024;
+        params.n_batch = 512; //received reports of 1024 and above crashing on some models
         params.n_threads = 1;
     }
diff --git a/llama_adapter.cpp b/llama_adapter.cpp
index 93f46fae6..6e437eb6c 100644
--- a/llama_adapter.cpp
+++ b/llama_adapter.cpp
@@ -160,7 +160,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
    int original_threads = params.n_threads;
    if (blasmode)
    {
-        params.n_batch = 1024;
+        params.n_batch = 512; //received reports of 1024 and above crashing on some models
        params.n_threads = 1;
    }