Merge branch 'master' into master

commit eef66e1d2e
Georgi Gerganov, 2023-07-21 13:52:25 +03:00 (committed by GitHub)
14 changed files with 617 additions and 523 deletions

.gitignore

@@ -62,6 +62,16 @@ perf-*.txt
 examples/jeopardy/results.txt
 pyproject.toml
 poetry.lock
 poetry.toml
+# Test binaries
+tests/test-double-float
+tests/test-grad0
+tests/test-opt
+tests/test-quantize-fns
+tests/test-quantize-perf
+tests/test-sampling
+tests/test-tokenizer-0

CMakeLists.txt

@@ -186,16 +186,7 @@ if (LLAMA_BLAS)
             pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
         elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
             # all Intel* libraries share the same include path
-            pkg_check_modules(DepBLAS mkl-sdl)
-            if (NOT DepBLAS)
-                if (BUILD_SHARED_LIBS)
-                    set(LINK_METHOD dynamic)
-                else()
-                    set(LINK_METHOD static)
-                endif()
-                string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR})
-                pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp)
-            endif()
+            pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
         elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
             # this doesn't provide pkg-config
             # suggest to assign BLAS_INCLUDE_DIRS on your own

Makefile

@@ -1,5 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
+# Binaries only useful for tests
+TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
 default: $(BUILD_TARGETS)
@@ -90,6 +93,28 @@ ifeq ($(UNAME_S),Haiku)
 CXXFLAGS += -pthread
 endif
+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+_WIN32 := 1
+endif
+# library name prefix
+ifneq ($(_WIN32),1)
+LIB_PRE := lib
+endif
+# Dynamic Shared Object extension
+ifneq ($(_WIN32),1)
+DSO_EXT := .so
+else
+DSO_EXT := .dll
+endif
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+LWINSOCK2 := -lws2_32
+endif
 ifdef LLAMA_GPROF
 CFLAGS += -pg
 CXXFLAGS += -pg
@@ -168,8 +193,12 @@ ifdef LLAMA_CUBLAS
 CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 OBJS += ggml-cuda.o
-NVCC = nvcc
 NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef LLAMA_CUDA_NVCC
+NVCC = $(LLAMA_CUDA_NVCC)
+else
+NVCC = nvcc
+endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
 NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
 else
@@ -198,7 +227,9 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+ifdef LLAMA_CUDA_CCBIN
+NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
@@ -294,7 +325,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 clean:
-	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
 #
 # Examples
@@ -325,14 +356,14 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
-libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
-embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
+	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -349,6 +380,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 # Tests
 #
+tests: $(TEST_TARGETS)
 benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@
@@ -356,6 +389,23 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-.PHONY: tests clean
-tests:
-	bash ./tests/run-tests.sh
+tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

README.md

@@ -360,7 +360,7 @@ Building the program with BLAS support may lead to some performance improvements
 ```bash
 mkdir build
 cd build
-cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 cmake --build . --config Release
 ```

ci/run.sh

@@ -243,7 +243,7 @@ function gg_sum_open_llama_3b_v2 {
 if [ -z $GG_BUILD_LOW_PERF ]; then
     rm -rf ${SRC}/models-mnt
-    mnt_models=$(realpath ${MNT}/models)
+    mnt_models=${MNT}/models
     mkdir -p ${mnt_models}
     ln -sfn ${mnt_models} ${SRC}/models-mnt

examples/Miku.sh

@@ -2,21 +2,21 @@
 set -e
 AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
 USER_NAME="${USER_NAME:-Anon}"
 # Uncomment and adjust to the number of CPU cores you want to use.
 #N_THREAD="${N_THREAD:-4}"
+CTX_SIZE="${CTX_SIZE:-4096}"
 N_PREDICTS="${N_PREDICTS:-4096}"
 GEN_OPTIONS=(--batch_size 1024
---ctx_size 2048
+--ctx_size "$CTX_SIZE"
 --keep -1
 --repeat_last_n 256
 --repeat_penalty 1.17647
---temp 0.7
+--temp 0.6
---top_k 40
---top_p 0.5)
+--mirostat 2)
 if [ -n "$N_THREAD" ]; then
 GEN_OPTIONS+=(--threads "$N_THREAD")
@@ -24,16 +24,17 @@ fi
 ./main "${GEN_OPTIONS[@]}" \
 --model "$MODEL" \
+--in-prefix " " \
+--in-suffix "${AI_NAME}:" \
 --n_predict "$N_PREDICTS" \
 --color --interactive \
 --reverse-prompt "${USER_NAME}:" \
---prompt "
-This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
+--prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
 ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
 ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only between ${USER_NAME} and ${AI_NAME}.
 The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
 ${AI_NAME} can only communicate through text, so she can't send images or videos.

examples/common.cpp

@@ -586,7 +586,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.n_batch = params.n_batch;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu = params.main_gpu;
-    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.tensor_split = params.tensor_split;
     lparams.low_vram = params.low_vram;
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
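
Note: the hunk above swaps the old memcpy into a fixed-size member for a plain pointer assignment, matching the llama.h change at the end of this diff. A minimal sketch of how calling code might fill the new field (the helper name is hypothetical; the caller must keep the array alive while the params are in use):

```cpp
// Sketch, not part of the diff: using the pointer-based tensor_split field.
#include "llama.h"

llama_context_params make_params_with_split(const float * split /* LLAMA_MAX_DEVICES entries, or nullptr */) {
    llama_context_params lparams = llama_context_default_params();
    lparams.tensor_split = split;  // store the pointer; no copy into the struct anymore
    return lparams;
}
```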

examples/server/CMakeLists.txt

@@ -7,6 +7,9 @@ target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
     add_dependencies(${TARGET} BUILD_INFO)

flake.nix

@@ -6,7 +6,7 @@
 outputs = { self, nixpkgs, flake-utils }:
 flake-utils.lib.eachDefaultSystem (system:
 let
-inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin;
+inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
 osSpecific = with pkgs; [ openmpi ] ++
 (
 if isAarch64 && isDarwin then
@@ -22,14 +22,13 @@
 CoreGraphics
 CoreVideo
 ]
-else if isx86_32 || isx86_64 then
-with pkgs; [ mkl ]
 else
 with pkgs; [ openblas ]
 );
 pkgs = import nixpkgs { inherit system; };
+nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
 llama-python =
-pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
+pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
 in {
 packages.default = pkgs.stdenv.mkDerivation {
 name = "llama.cpp";
@@ -37,33 +36,21 @@
 postPatch = ''
 substituteInPlace ./ggml-metal.m \
 --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
 '';
-nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
+nativeBuildInputs = nativeBuildInputs;
 buildInputs = osSpecific;
 cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
 ++ (if isAarch64 && isDarwin then [
 "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
 "-DLLAMA_METAL=ON"
-] else if isx86_32 || isx86_64 then [
-"-DLLAMA_BLAS=ON"
-"-DLLAMA_BLAS_VENDOR=Intel10_lp64"
 ] else [
 "-DLLAMA_BLAS=ON"
 "-DLLAMA_BLAS_VENDOR=OpenBLAS"
 ]);
-installPhase = ''
-runHook preInstall
-install -D bin/* -t $out/bin
-install -Dm644 lib*.so -t $out/lib
+postInstall = ''
 mv $out/bin/main $out/bin/llama
 mv $out/bin/server $out/bin/llama-server
-echo "#!${llama-python}/bin/python" > $out/bin/convert.py
-cat ${./convert.py} >> $out/bin/convert.py
-chmod +x $out/bin/convert.py
-runHook postInstall
 '';
 meta.mainProgram = "llama";
 };
@@ -81,7 +68,7 @@
 };
 apps.default = self.apps.${system}.llama;
 devShells.default = pkgs.mkShell {
-packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
+packages = nativeBuildInputs ++ osSpecific;
 };
 });
 }

ggml-cuda.cu

@@ -2512,6 +2512,9 @@ void ggml_init_cublas() {
 }
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {

ggml-metal.m

@@ -676,8 +676,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
 } break;
 case GGML_TYPE_Q3_K:
@@ -694,8 +694,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
 } break;
 case GGML_TYPE_Q5_K:
@@ -703,8 +703,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
 } break;
 case GGML_TYPE_Q6_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
@ -712,8 +712,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 4; nth0 = 2;
nth1 = 16; nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
} break; } break;
default: default:
@@ -739,14 +739,17 @@ void ggml_metal_graph_compute(
 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
-if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 }
-else if (src0t == GGML_TYPE_Q2_K ||
-         src0t == GGML_TYPE_Q3_K ||
-         src0t == GGML_TYPE_Q4_K ||
-         src0t == GGML_TYPE_Q5_K ||
-         src0t == GGML_TYPE_Q6_K) {
+else if (src0t == GGML_TYPE_Q5_K) {
+    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+}
+else if (src0t == GGML_TYPE_Q6_K) {
+    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+}
+else if (src0t == GGML_TYPE_Q3_K) {
     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
     [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 } else {
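
Note: the dispatch sizes above are integer ceiling divisions, so with the new nth0 = 2, nth1 = 32 threadgroup shape each threadgroup covers a fixed number of rows (8 for Q4_0/Q4_1/Q2_K/Q4_K, 4 for Q5_K, 2 for Q6_K). A small standalone sketch of the rounding-up idiom (not from the diff):

```cpp
// Sketch: (n + d - 1) / d rounds up, so every row is covered even when
// ne01 is not an exact multiple of the rows-per-threadgroup count.
#include <cstdint>

constexpr int64_t ceil_div(int64_t n, int64_t d) { return (n + d - 1) / d; }

static_assert(ceil_div(4096, 8) == 512, "exact multiple");
static_assert(ceil_div(4097, 8) == 513, "one extra threadgroup for the remainder");
```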
@@ -792,7 +795,7 @@ void ggml_metal_graph_compute(
 const float eps = 1e-6f;
-const int nth = 256;
+const int nth = 512;
 [encoder setComputePipelineState:ctx->pipeline_rms_norm];
 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -800,7 +803,7 @@ void ggml_metal_graph_compute(
 [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
 [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
 [encoder setBytes:&eps length:sizeof( float) atIndex:4];
-[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+[encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
 const int64_t nrows = ggml_nrows(src0);
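
Note: the rms_norm threadgroup memory shrinks from one float per thread to one float per 32 threads; presumably the rewritten kernel (part of the suppressed Metal shader diff below) reduces within 32-lane SIMD groups first, so only one partial sum per SIMD group needs to be staged. A tiny sanity check of that sizing (an assumption, not from the diff):

```cpp
// Assumption: 512 threads per threadgroup, 32-lane SIMD groups.
constexpr int nth        = 512;
constexpr int simd_width = 32;
static_assert(nth / simd_width == 16, "only 16 floats of threadgroup memory needed");
```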

(One file's diff is suppressed because it is too large.)

llama.cpp

@@ -555,7 +555,9 @@ struct llama_file_loader {
 }
 // skip to the next multiple of 32 bytes
+if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
     file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+}
 tensor.file_off = file.tell();
 tensor.name = name;
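
Note: the newly guarded seek rounds the read offset up to the next 32-byte boundary; `-offset & 31` is the number of padding bytes needed to reach it. A standalone sketch of the arithmetic (not from the diff):

```cpp
// Sketch of the alignment arithmetic used by the guarded seek above.
#include <cassert>
#include <cstddef>

std::ptrdiff_t pad_to_32(std::ptrdiff_t offset) {
    // For offset >= 0: -offset & 31 == (32 - offset % 32) % 32
    return -offset & 31;
}

int main() {
    assert(pad_to_32(0)  == 0);   // already aligned
    assert(pad_to_32(1)  == 31);  // 1 + 31 == 32
    assert(pad_to_32(60) == 4);   // 60 + 4 == 64
    return 0;
}
```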
@@ -847,7 +849,7 @@ struct llama_context_params llama_context_default_params() {
 /*.n_batch =*/ 512,
 /*.gpu_layers =*/ 0,
 /*.main_gpu =*/ 0,
-/*.tensor_split =*/ {0},
+/*.tensor_split =*/ nullptr,
 /*.rope_freq_base =*/ 10000.0f,
 /*.rope_freq_scale =*/ 1.0f,
 /*.progress_callback =*/ nullptr,
@@ -1287,7 +1289,7 @@ static bool llama_model_load(
 int n_batch,
 int n_gpu_layers,
 int main_gpu,
-float * tensor_split,
+const float * tensor_split,
 float rope_freq_base,
 float rope_freq_scale,
 bool low_vram,

llama.h

@@ -88,7 +88,8 @@ extern "C" {
 int32_t n_batch; // prompt processing batch size
 int32_t n_gpu_layers; // number of layers to store in VRAM
 int32_t main_gpu; // the GPU that is used for scratch and small tensors
-float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 float rope_freq_base; // RoPE base frequency