Merge branch 'master' into master

commit eef66e1d2e

14 changed files with 617 additions and 523 deletions
.gitignore (vendored): 10 changes

@@ -62,6 +62,16 @@ perf-*.txt
 
 examples/jeopardy/results.txt
 
 pyproject.toml
 poetry.lock
 poetry.toml
+
+# Test binaries
+tests/test-double-float
+tests/test-grad0
+tests/test-opt
+tests/test-quantize-fns
+tests/test-quantize-perf
+tests/test-sampling
+tests/test-tokenizer-0
CMakeLists.txt

@@ -186,16 +186,7 @@ if (LLAMA_BLAS)
         pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
     elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
         # all Intel* libraries share the same include path
-        pkg_check_modules(DepBLAS mkl-sdl)
-        if (NOT DepBLAS)
-            if (BUILD_SHARED_LIBS)
-                set(LINK_METHOD dynamic)
-            else()
-                set(LINK_METHOD static)
-            endif()
-            string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR})
-            pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp)
-        endif()
+        pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
     elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
         # this doesn't provide pkg-config
         # suggest to assign BLAS_INCLUDE_DIRS on your own
Makefile: 72 changes

@@ -1,5 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
+
+# Binaries only useful for tests
+TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
 
 default: $(BUILD_TARGETS)

@@ -90,6 +93,28 @@ ifeq ($(UNAME_S),Haiku)
 	CXXFLAGS += -pthread
 endif
 
+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+	_WIN32 := 1
+endif
+
+# library name prefix
+ifneq ($(_WIN32),1)
+	LIB_PRE := lib
+endif
+
+# Dynamic Shared Object extension
+ifneq ($(_WIN32),1)
+	DSO_EXT := .so
+else
+	DSO_EXT := .dll
+endif
+
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+	LWINSOCK2 := -lws2_32
+endif
+
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg

@@ -168,8 +193,12 @@ ifdef LLAMA_CUBLAS
 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
-	NVCC      = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef LLAMA_CUDA_NVCC
+	NVCC = $(LLAMA_CUDA_NVCC)
+else
+	NVCC = nvcc
+endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
 else

@@ -198,7 +227,9 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+ifdef LLAMA_CUDA_CCBIN
+	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS

@@ -294,7 +325,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
 
 #
 # Examples

@@ -325,14 +356,14 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
-libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 
 
-embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -349,6 +380,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 # Tests
 #
 
+tests: $(TEST_TARGETS)
+
 benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@

@@ -356,6 +389,23 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-.PHONY: tests clean
-tests:
-	bash ./tests/run-tests.sh
+tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
README.md

@@ -360,7 +360,7 @@ Building the program with BLAS support may lead to some performance improvements
 ```bash
 mkdir build
 cd build
-cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 cmake --build . --config Release
 ```
ci/run.sh

@@ -243,7 +243,7 @@ function gg_sum_open_llama_3b_v2 {
 if [ -z $GG_BUILD_LOW_PERF ]; then
     rm -rf ${SRC}/models-mnt
 
-    mnt_models=$(realpath ${MNT}/models)
+    mnt_models=${MNT}/models
     mkdir -p ${mnt_models}
     ln -sfn ${mnt_models} ${SRC}/models-mnt
 
examples/Miku.sh

@@ -2,21 +2,21 @@
 set -e
 
 AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
 USER_NAME="${USER_NAME:-Anon}"
 
 # Uncomment and adjust to the number of CPU cores you want to use.
 #N_THREAD="${N_THREAD:-4}"
+CTX_SIZE="${CTX_SIZE:-4096}"
 N_PREDICTS="${N_PREDICTS:-4096}"
 
 GEN_OPTIONS=(--batch_size 1024
---ctx_size 2048
+--ctx_size "$CTX_SIZE"
 --keep -1
 --repeat_last_n 256
 --repeat_penalty 1.17647
---temp 0.7
---top_k 40
---top_p 0.5)
+--temp 0.6
+--mirostat 2)
 
 if [ -n "$N_THREAD" ]; then
     GEN_OPTIONS+=(--threads "$N_THREAD")

@@ -24,16 +24,17 @@ fi
 
 ./main "${GEN_OPTIONS[@]}" \
     --model "$MODEL" \
+    --in-prefix " " \
+    --in-suffix "${AI_NAME}:" \
     --n_predict "$N_PREDICTS" \
     --color --interactive \
     --reverse-prompt "${USER_NAME}:" \
-    --prompt "
-This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
+    --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
 ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
 ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only between ${USER_NAME} and ${AI_NAME}.
 The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
 ${AI_NAME} can only communicate through text, so she can't send images or videos.
 
examples/common.cpp

@@ -586,7 +586,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.n_batch      = params.n_batch;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu     = params.main_gpu;
-    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.tensor_split = params.tensor_split;
     lparams.low_vram     = params.low_vram;
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
examples/server/CMakeLists.txt

@@ -7,6 +7,9 @@ target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
     add_dependencies(${TARGET} BUILD_INFO)
flake.nix: 27 changes

@@ -6,7 +6,7 @@
   outputs = { self, nixpkgs, flake-utils }:
     flake-utils.lib.eachDefaultSystem (system:
       let
-        inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin;
+        inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
         osSpecific = with pkgs; [ openmpi ] ++
         (
           if isAarch64 && isDarwin then

@@ -22,14 +22,13 @@
             CoreGraphics
             CoreVideo
           ]
-          else if isx86_32 || isx86_64 then
-          with pkgs; [ mkl ]
           else
           with pkgs; [ openblas ]
         );
         pkgs = import nixpkgs { inherit system; };
+        nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
         llama-python =
-          pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
+          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
       in {
         packages.default = pkgs.stdenv.mkDerivation {
           name = "llama.cpp";

@@ -37,33 +36,21 @@
         postPatch = ''
           substituteInPlace ./ggml-metal.m \
             --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+          substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
         '';
-        nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
+        nativeBuildInputs = nativeBuildInputs;
         buildInputs = osSpecific;
         cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
           ++ (if isAarch64 && isDarwin then [
             "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
             "-DLLAMA_METAL=ON"
-          ] else if isx86_32 || isx86_64 then [
-            "-DLLAMA_BLAS=ON"
-            "-DLLAMA_BLAS_VENDOR=Intel10_lp64"
           ] else [
             "-DLLAMA_BLAS=ON"
             "-DLLAMA_BLAS_VENDOR=OpenBLAS"
           ]);
-        installPhase = ''
-          runHook preInstall
-
-          install -D bin/* -t $out/bin
-          install -Dm644 lib*.so -t $out/lib
+        postInstall = ''
           mv $out/bin/main $out/bin/llama
           mv $out/bin/server $out/bin/llama-server
-
-          echo "#!${llama-python}/bin/python" > $out/bin/convert.py
-          cat ${./convert.py} >> $out/bin/convert.py
-          chmod +x $out/bin/convert.py
-
-          runHook postInstall
         '';
         meta.mainProgram = "llama";
       };

@@ -81,7 +68,7 @@
       };
       apps.default = self.apps.${system}.llama;
       devShells.default = pkgs.mkShell {
-        packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
+        packages = nativeBuildInputs ++ osSpecific;
       };
     });
 }
ggml-cuda.cu

@@ -2512,6 +2512,9 @@ void ggml_init_cublas() {
 }
 
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {
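With the added guard, passing a null pointer becomes an explicit "leave the split unchanged" no-op instead of a null dereference. A minimal caller-side sketch, assuming the application supplies its own per-device proportions; only the `ggml_cuda_set_tensor_split` signature comes from this diff, the helper and its inputs are illustrative:

```cpp
#include <vector>

// Signature as shown in the hunk above.
void ggml_cuda_set_tensor_split(const float * tensor_split);

// Illustrative helper: forward user-provided proportions, or nullptr to keep
// the current split (the new early return makes that case a no-op).
static void apply_tensor_split(const std::vector<float> & proportions) {
    ggml_cuda_set_tensor_split(proportions.empty() ? nullptr : proportions.data());
}
```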
ggml-metal.m: 35 changes

@@ -676,8 +676,8 @@ void ggml_metal_graph_compute(
                 GGML_ASSERT(ne02 == 1);
                 GGML_ASSERT(ne12 == 1);
 
-                nth0 = 4;
-                nth1 = 16;
+                nth0 = 2;
+                nth1 = 32;
                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
             } break;
         case GGML_TYPE_Q3_K:

@@ -694,8 +694,8 @@ void ggml_metal_graph_compute(
                 GGML_ASSERT(ne02 == 1);
                 GGML_ASSERT(ne12 == 1);
 
-                nth0 = 4;
-                nth1 = 16;
+                nth0 = 2;
+                nth1 = 32;
                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
             } break;
         case GGML_TYPE_Q5_K:

@@ -703,8 +703,8 @@ void ggml_metal_graph_compute(
                 GGML_ASSERT(ne02 == 1);
                 GGML_ASSERT(ne12 == 1);
 
-                nth0 = 4;
-                nth1 = 16;
+                nth0 = 2;
+                nth1 = 32;
                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
             } break;
         case GGML_TYPE_Q6_K:

@@ -712,8 +712,8 @@ void ggml_metal_graph_compute(
                 GGML_ASSERT(ne02 == 1);
                 GGML_ASSERT(ne12 == 1);
 
-                nth0 = 4;
-                nth1 = 16;
+                nth0 = 2;
+                nth1 = 32;
                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
             } break;
         default:

@@ -739,14 +739,17 @@ void ggml_metal_graph_compute(
         [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
         [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
 
-        if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+        if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+            src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
         }
-        else if (src0t == GGML_TYPE_Q2_K ||
-                 src0t == GGML_TYPE_Q3_K ||
-                 src0t == GGML_TYPE_Q4_K ||
-                 src0t == GGML_TYPE_Q5_K ||
-                 src0t == GGML_TYPE_Q6_K) {
+        else if (src0t == GGML_TYPE_Q5_K) {
+            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+        }
+        else if (src0t == GGML_TYPE_Q6_K) {
+            [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+        }
+        else if (src0t == GGML_TYPE_Q3_K) {
             [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
             [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
         } else {
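Read together with the nth0/nth1 changes above, the restructured branches give each k-quant type its own threadgroup grid along the row dimension. A small sketch that just restates the grid widths from the new dispatch calls; the function names are illustrative, and ne01 is the row count of src0:

```cpp
#include <cstdint>
#include <cstdio>

// Row-dimension threadgroup counts used by the new dispatch branches.
int64_t groups_q4_0_q4_1_q2_K_q4_K(int64_t ne01) { return (ne01 + 7) / 8; } // 8 rows per threadgroup
int64_t groups_q5_K(int64_t ne01)                { return (ne01 + 3) / 4; } // 4 rows per threadgroup
int64_t groups_q6_K(int64_t ne01)                { return (ne01 + 1) / 2; } // 2 rows per threadgroup
int64_t groups_q3_K(int64_t ne01)                { return ne01; }           // 1 row per threadgroup

int main() {
    const int64_t ne01 = 4096; // example row count
    std::printf("Q4_0/Q4_1/Q2_K/Q4_K: %lld threadgroups\n", (long long) groups_q4_0_q4_1_q2_K_q4_K(ne01)); // 512
    std::printf("Q5_K:                %lld threadgroups\n", (long long) groups_q5_K(ne01));                // 1024
    std::printf("Q6_K:                %lld threadgroups\n", (long long) groups_q6_K(ne01));                // 2048
    std::printf("Q3_K:                %lld threadgroups\n", (long long) groups_q3_K(ne01));                // 4096
    return 0;
}
```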
@@ -792,7 +795,7 @@ void ggml_metal_graph_compute(
 
         const float eps = 1e-6f;
 
-        const int nth = 256;
+        const int nth = 512;
 
         [encoder setComputePipelineState:ctx->pipeline_rms_norm];
         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];

@@ -800,7 +803,7 @@ void ggml_metal_graph_compute(
         [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
         [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
         [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-        [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+        [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
 
         const int64_t nrows = ggml_nrows(src0);
 
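For the rms_norm hunks, the two edits move in opposite directions: the threadgroup doubles from 256 to 512 threads while the shared scratch shrinks from one float per thread to one float per 32 threads. A quick arithmetic check of the byte counts; the numbers come from the hunks, while reading the divisor as the 32-wide SIMD-group size is an assumption:

```cpp
#include <cstdio>

int main() {
    const int nth_old = 256, nth_new = 512;
    // Old: nth * sizeof(float) bytes of threadgroup memory.
    std::printf("old scratch: %zu bytes\n", nth_old * sizeof(float));      // 1024 bytes
    // New: nth/32 * sizeof(float) bytes, i.e. one float per 32-thread group.
    std::printf("new scratch: %zu bytes\n", nth_new / 32 * sizeof(float)); //   64 bytes
    return 0;
}
```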
ggml-metal.metal: 911 changes

File diff suppressed because it is too large.
llama.cpp

@@ -555,7 +555,9 @@ struct llama_file_loader {
             }
 
             // skip to the next multiple of 32 bytes
-            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            }
 
             tensor.file_off = file.tell();
             tensor.name = name;
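The seek that is now wrapped in the version check relies on a standard alignment trick: for a non-negative offset t, (-t) & 31 is the number of bytes needed to reach the next multiple of 32, and zero when t is already aligned. A standalone check of that identity; the helper name is illustrative, not from the source:

```cpp
#include <cassert>
#include <cstddef>

// Bytes of padding needed to round t up to the next multiple of 32.
static std::ptrdiff_t pad_to_32(std::ptrdiff_t t) {
    return -t & 31;
}

int main() {
    assert(pad_to_32(0)  == 0);   // already aligned
    assert(pad_to_32(1)  == 31);  // 1  + 31 -> 32
    assert(pad_to_32(45) == 19);  // 45 + 19 -> 64
    assert(pad_to_32(64) == 0);
    return 0;
}
```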
@@ -847,7 +849,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch            =*/ 512,
         /*.gpu_layers         =*/ 0,
         /*.main_gpu           =*/ 0,
-        /*.tensor_split       =*/ {0},
+        /*.tensor_split       =*/ nullptr,
         /*.rope_freq_base     =*/ 10000.0f,
         /*.rope_freq_scale    =*/ 1.0f,
         /*.progress_callback  =*/ nullptr,

@@ -1287,7 +1289,7 @@ static bool llama_model_load(
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
-        float * tensor_split,
+        const float * tensor_split,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
llama.h: 3 changes

@@ -88,7 +88,8 @@ extern "C" {
         int32_t n_batch;      // prompt processing batch size
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base;  // RoPE base frequency
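Taken together with the common.cpp and llama.cpp hunks above, tensor_split is now a pointer the caller owns rather than a fixed-size member that gets memcpy'd into the params. A minimal usage sketch under the new signature; the split values are placeholders, and since the struct only stores the pointer, the caller's array has to outlive any use of the params:

```cpp
#include "llama.h"

int main() {
    // Per-device proportions, sized by LLAMA_MAX_DEVICES as the new comment notes.
    static float tensor_split[LLAMA_MAX_DEVICES] = {0};
    tensor_split[0] = 1.0f; // illustrative: put everything on the first device

    llama_context_params params = llama_context_default_params();
    params.tensor_split = tensor_split; // previously: memcpy into a float[LLAMA_MAX_DEVICES] member
    // params.tensor_split = nullptr;   // the new default: the backend keeps its own split

    // ... load the model / create the context with `params` as usual ...
    return 0;
}
```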