Merge remote-tracking branch 'origin' into add_gpt2_support

This commit is contained in:
EC2 Default User 2023-12-15 13:26:09 +00:00
commit 29e3645501
42 changed files with 4481 additions and 983 deletions

View file

@ -15,6 +15,9 @@ indent_size = 4
[Makefile] [Makefile]
indent_style = tab indent_style = tab
[scripts/*.mk]
indent_style = tab
[prompts/*.txt] [prompts/*.txt]
insert_final_newline = unset insert_final_newline = unset

View file

@ -397,57 +397,102 @@ if (LLAMA_HIPBLAS)
endif() endif()
endif() endif()
if (LLAMA_ALL_WARNINGS) function(get_flags CCID CCVER)
if (NOT MSVC) set(C_FLAGS "")
set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) set(CXX_FLAGS "")
set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
set(host_cxx_flags "")
if (CMAKE_C_COMPILER_ID MATCHES "Clang") if (CCID MATCHES "Clang")
set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return) set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi) set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
if ( if (
(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
(CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0) (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
) )
set(c_flags ${c_flags} -Wdouble-promotion) set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
endif() endif()
elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU") elseif (CCID STREQUAL "GNU")
set(c_flags ${c_flags} -Wdouble-promotion) set(C_FLAGS -Wdouble-promotion)
set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds) set(CXX_FLAGS -Wno-array-bounds)
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0) if (CCVER VERSION_GREATER_EQUAL 7.1.0)
set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation) set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
endif() endif()
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0) if (CCVER VERSION_GREATER_EQUAL 8.1.0)
set(host_cxx_flags ${host_cxx_flags} -Wextra-semi) set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
endif() endif()
endif() endif()
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
endfunction()
if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
set(C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-Werror=implicit-int -Werror=implicit-function-declaration)
set(CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
set(C_FLAGS ${WARNING_FLAGS} ${C_FLAGS})
set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
else() else()
# todo : msvc # todo : msvc
set(C_FLAGS "")
set(CXX_FLAGS "")
endif()
endif() endif()
set(c_flags ${c_flags} ${warning_flags}) if (LLAMA_CUBLAS)
set(cxx_flags ${cxx_flags} ${warning_flags}) set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
"$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
endif()
if (NOT MSVC) if (NOT MSVC)
set(cuda_flags -Wno-pedantic) set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
endif()
set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
if (NOT cuda_host_flags STREQUAL "")
set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
endif() endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>") if (LLAMA_ALL_WARNINGS AND NOT MSVC)
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
endif()
execute_process(
COMMAND ${NVCC_CMD} -Xcompiler --version
OUTPUT_VARIABLE CUDA_CCFULLVER
ERROR_QUIET
)
if (NOT CUDA_CCFULLVER MATCHES clang)
set(CUDA_CCID "GNU")
execute_process(
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
OUTPUT_VARIABLE CUDA_CCVER
ERROR_QUIET
)
else()
if (CUDA_CCFULLVER MATCHES Apple)
set(CUDA_CCID "AppleClang")
else()
set(CUDA_CCID "Clang")
endif()
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
endif()
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
get_flags(${CUDA_CCID} ${CUDA_CCVER})
list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS STREQUAL "")
set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
endif()
endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
endif()
if (WIN32) if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
@ -471,6 +516,7 @@ endif()
execute_process( execute_process(
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
ERROR_VARIABLE output ERROR_VARIABLE output
OUTPUT_QUIET
) )
if (output MATCHES "dyld-1015\.7") if (output MATCHES "dyld-1015\.7")
add_compile_definitions(HAVE_BUGGY_APPLE_LINKER) add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
@ -593,6 +639,11 @@ else()
message(STATUS "Unknown architecture") message(STATUS "Unknown architecture")
endif() endif()
if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=0x602)
endif()
# #
# POSIX conformance # POSIX conformance
# #

124
Makefile
View file

@ -26,20 +26,6 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m) UNAME_M := $(shell uname -m)
endif endif
ifeq '' '$(findstring clang,$(shell $(CC) --version))'
CC_IS_GCC=1
CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
else
CC_IS_CLANG=1
ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
CC_IS_LLVM_CLANG=1
else
CC_IS_APPLE_CLANG=1
endif
CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
endif
# Mac OS + Arm can report x86_64 # Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
@ -122,8 +108,8 @@ MK_CXXFLAGS = -std=c++11 -fPIC
# -Ofast tends to produce faster code, but may not be available for some compilers. # -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST ifdef LLAMA_FAST
MK_CFLAGS += -Ofast MK_CFLAGS += -Ofast
MK_HOST_CXXFLAGS += -Ofast HOST_CXXFLAGS += -Ofast
MK_CUDA_CXXFLAGS += -O3 MK_NVCCFLAGS += -O3
else else
MK_CFLAGS += -O3 MK_CFLAGS += -O3
MK_CXXFLAGS += -O3 MK_CXXFLAGS += -O3
@ -220,30 +206,6 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
-Werror=implicit-function-declaration -Werror=implicit-function-declaration
MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
ifeq ($(CC_IS_CLANG), 1)
# clang options
MK_CFLAGS += -Wunreachable-code-break -Wunreachable-code-return
MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
MK_CFLAGS += -Wdouble-promotion
endif
ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
MK_CFLAGS += -Wdouble-promotion
endif
else
# gcc options
MK_CFLAGS += -Wdouble-promotion
MK_HOST_CXXFLAGS += -Wno-array-bounds
ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
MK_HOST_CXXFLAGS += -Wno-format-truncation
endif
ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
MK_HOST_CXXFLAGS += -Wextra-semi
endif
endif
# this version of Apple ld64 is buggy # this version of Apple ld64 is buggy
ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))' ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@ -295,7 +257,7 @@ ifndef RISCV
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
# Use all CPU extensions that are available: # Use all CPU extensions that are available:
MK_CFLAGS += -march=native -mtune=native MK_CFLAGS += -march=native -mtune=native
MK_HOST_CXXFLAGS += -march=native -mtune=native HOST_CXXFLAGS += -march=native -mtune=native
# Usage AVX-only # Usage AVX-only
#MK_CFLAGS += -mfma -mf16c -mavx #MK_CFLAGS += -mfma -mf16c -mavx
@ -306,12 +268,15 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
#MK_CXXFLAGS += -mssse3 #MK_CXXFLAGS += -mssse3
endif endif
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
# https://github.com/ggerganov/llama.cpp/issues/2922 # https://github.com/ggerganov/llama.cpp/issues/2922
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
# Target Windows 8 for PrefetchVirtualMemory
MK_CPPFLAGS += -D_WIN32_WINNT=0x602
endif endif
ifneq ($(filter aarch64%,$(UNAME_M)),) ifneq ($(filter aarch64%,$(UNAME_M)),)
@ -395,61 +360,64 @@ ifdef LLAMA_CUBLAS
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o OBJS += ggml-cuda.o
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math MK_NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
endif
ifdef LLAMA_CUDA_NVCC ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC) NVCC = $(LLAMA_CUDA_NVCC)
else else
NVCC = nvcc NVCC = nvcc
endif #LLAMA_CUDA_NVCC endif #LLAMA_CUDA_NVCC
ifdef CUDA_DOCKER_ARCH ifdef CUDA_DOCKER_ARCH
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else ifdef CUDA_POWER_ARCH else ifndef CUDA_POWER_ARCH
NVCCFLAGS += MK_NVCCFLAGS += -arch=native
else
NVCCFLAGS += -arch=native
endif # CUDA_DOCKER_ARCH endif # CUDA_DOCKER_ARCH
ifdef LLAMA_CUDA_FORCE_DMMV ifdef LLAMA_CUDA_FORCE_DMMV
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_FORCE_MMQ ifdef LLAMA_CUDA_FORCE_MMQ
NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # LLAMA_CUDA_FORCE_MMQ endif # LLAMA_CUDA_FORCE_MMQ
ifdef LLAMA_CUDA_DMMV_X ifdef LLAMA_CUDA_DMMV_X
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
else else
NVCCFLAGS += -DGGML_CUDA_DMMV_X=32 MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
endif # LLAMA_CUDA_DMMV_X endif # LLAMA_CUDA_DMMV_X
ifdef LLAMA_CUDA_MMV_Y ifdef LLAMA_CUDA_MMV_Y
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
else ifdef LLAMA_CUDA_DMMV_Y else ifdef LLAMA_CUDA_DMMV_Y
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
else else
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1 MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
endif # LLAMA_CUDA_MMV_Y endif # LLAMA_CUDA_MMV_Y
ifdef LLAMA_CUDA_F16 ifdef LLAMA_CUDA_F16
NVCCFLAGS += -DGGML_CUDA_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_F16 endif # LLAMA_CUDA_F16
ifdef LLAMA_CUDA_DMMV_F16 ifdef LLAMA_CUDA_DMMV_F16
NVCCFLAGS += -DGGML_CUDA_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_DMMV_F16 endif # LLAMA_CUDA_DMMV_F16
ifdef LLAMA_CUDA_KQUANTS_ITER ifdef LLAMA_CUDA_KQUANTS_ITER
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
else else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif endif
ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE) MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
else else
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
#ifdef LLAMA_CUDA_CUBLAS #ifdef LLAMA_CUDA_CUBLAS
# NVCCFLAGS += -DGGML_CUDA_CUBLAS # MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS #endif # LLAMA_CUDA_CUBLAS
ifdef LLAMA_CUDA_CCBIN ifdef LLAMA_CUDA_CCBIN
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) -c $< -o $@ $(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endif # LLAMA_CUBLAS endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST ifdef LLAMA_CLBLAST
@ -511,16 +479,22 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI endif # LLAMA_MPI
GF_CC := $(CC)
include scripts/get-flags.mk
# combine build flags with cmdline overrides # combine build flags with cmdline overrides
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS) override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS) override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
# save CXXFLAGS before we add host-only options # identify CUDA host compiler
NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)" ifdef LLAMA_CUBLAS
override CXXFLAGS += $(HOST_CXXFLAGS) GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
include scripts/get-flags.mk
CUDA_CXXFLAGS := $(GF_CXXFLAGS)
endif
# #
# Print build information # Print build information
@ -730,16 +704,16 @@ tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS) tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS) tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)

View file

@ -10,6 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics ### Hot topics
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309** - **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225 - Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
@ -96,7 +97,18 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410) - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
- [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586) - [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586)
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
**Multimodal models:**
- [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
**Bindings:** **Bindings:**

View file

@ -656,6 +656,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
} else if (arg == "-h" || arg == "--help") { } else if (arg == "-h" || arg == "--help") {
return false; return false;
} else if (arg == "--version") {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
exit(0);
} else if (arg == "--random-prompt") { } else if (arg == "--random-prompt") {
params.random_prompt = true; params.random_prompt = true;
} else if (arg == "--in-prefix-bos") { } else if (arg == "--in-prefix-bos") {
@ -794,6 +798,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf("\n"); printf("\n");
printf("options:\n"); printf("options:\n");
printf(" -h, --help show this help message and exit\n"); printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n");
printf(" -i, --interactive run in interactive mode\n"); printf(" -i, --interactive run in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");

View file

@ -61,13 +61,13 @@
// #define LOG_TARGET stderr // #define LOG_TARGET stderr
// #include "log.h" // #include "log.h"
// //
// The log target can also be redirected to a diffrent function // The log target can also be redirected to a different function
// like so: // like so:
// //
// #define LOG_TARGET log_handler_diffrent() // #define LOG_TARGET log_handler_different()
// #include "log.h" // #include "log.h"
// //
// FILE* log_handler_diffrent() // FILE* log_handler_different()
// { // {
// return stderr; // return stderr;
// } // }
@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
// Disables logs entirely at runtime. // Disables logs entirely at runtime.
// Makes LOG() and LOG_TEE() produce no output, // Makes LOG() and LOG_TEE() produce no output,
// untill enabled back. // until enabled back.
#define log_disable() log_disable_impl() #define log_disable() log_disable_impl()
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE

View file

@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
float scale = 1.0f; // xavier float scale = 1.0f; // xavier
switch (tensor->n_dims) { switch (ggml_n_dims(tensor)) {
case 1: case 1:
scale /= sqrtf((float) tensor->ne[0]); scale /= sqrtf((float) tensor->ne[0]);
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
} }
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
switch (tensor->n_dims) { switch (ggml_n_dims(tensor)) {
case 1: case 1:
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) {
} }
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
GGML_ASSERT(tensor->n_dims == 1);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == 1);
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
} }
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
GGML_ASSERT(tensor->n_dims == 2);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
} }
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
GGML_ASSERT(tensor->n_dims == 3);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
GGML_ASSERT(tensor->ne[2] == ne2); GGML_ASSERT(tensor->ne[2] == ne2);
GGML_ASSERT(tensor->ne[3] == 1);
} }
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
GGML_ASSERT(tensor->n_dims == 4);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
GGML_ASSERT(tensor->ne[2] == ne2); GGML_ASSERT(tensor->ne[2] == ne2);
@ -225,8 +227,8 @@ int64_t get_example_targets_batch(
bool sample_random_offsets bool sample_random_offsets
) { ) {
GGML_ASSERT(samples_count > 0); GGML_ASSERT(samples_count > 0);
GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(ggml_is_matrix(tokens_input));
GGML_ASSERT(target_probs->n_dims == 3); GGML_ASSERT(ggml_is_3d(target_probs));
int64_t n_vocab = target_probs->ne[0]; int64_t n_vocab = target_probs->ne[0];
int64_t n_tokens = tokens_input->ne[0]; int64_t n_tokens = tokens_input->ne[0];
int64_t n_batch = tokens_input->ne[1]; int64_t n_batch = tokens_input->ne[1];

View file

@ -77,8 +77,18 @@ class Model:
self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_embedding_length(n_embd)
if (n_ff := self.hparams.get("intermediate_size")) is not None: if (n_ff := self.hparams.get("intermediate_size")) is not None:
self.gguf_writer.add_feed_forward_length(n_ff) self.gguf_writer.add_feed_forward_length(n_ff)
if (n_head := self.hparams.get("num_attention_head")) is not None: if (n_head := self.hparams.get("num_attention_heads")) is not None:
self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count(n_head)
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
self.gguf_writer.add_head_count_kv(n_head_kv)
if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
if (n_experts := self.hparams.get("num_local_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
def write_tensors(self): def write_tensors(self):
@ -170,6 +180,8 @@ class Model:
return StableLMModel return StableLMModel
if model_architecture == "QWenLMHeadModel": if model_architecture == "QWenLMHeadModel":
return QwenModel return QwenModel
if model_architecture == "MixtralForCausalLM":
return MixtralModel
return Model return Model
def _is_model_safetensors(self) -> bool: def _is_model_safetensors(self) -> bool:
@ -207,6 +219,8 @@ class Model:
return gguf.MODEL_ARCH.STABLELM return gguf.MODEL_ARCH.STABLELM
if arch == "QWenLMHeadModel": if arch == "QWenLMHeadModel":
return gguf.MODEL_ARCH.QWEN return gguf.MODEL_ARCH.QWEN
if arch == "MixtralForCausalLM":
return gguf.MODEL_ARCH.LLAMA
raise NotImplementedError(f'Architecture "{arch}" not supported!') raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -837,6 +851,11 @@ class StableLMModel(Model):
self.gguf_writer.add_layer_norm_eps(1e-5) self.gguf_writer.add_layer_norm_eps(1e-5)
class MixtralModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
class QwenModel(Model): class QwenModel(Model):
@staticmethod @staticmethod
def token_bytes_to_string(b): def token_bytes_to_string(b):

View file

@ -10,6 +10,7 @@ import itertools
import json import json
import math import math
import mmap import mmap
import os
import pickle import pickle
import re import re
import signal import signal
@ -18,15 +19,15 @@ import sys
import time import time
import zipfile import zipfile
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from collections import OrderedDict
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, TypeVar, cast
import numpy as np import numpy as np
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
import os
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
@ -42,6 +43,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
ARCH = gguf.MODEL_ARCH.LLAMA ARCH = gguf.MODEL_ARCH.LLAMA
DEFAULT_CONCURRENCY = 8 DEFAULT_CONCURRENCY = 8
# #
# data types # data types
# #
@ -158,7 +160,9 @@ class Params:
n_ff: int n_ff: int
n_head: int n_head: int
n_head_kv: int n_head_kv: int
f_norm_eps: float n_experts: int | None = None
n_experts_used: int | None = None
f_norm_eps: float | None = None
rope_scaling_type: gguf.RopeScalingType | None = None rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None f_rope_freq_base: float | None = None
@ -233,6 +237,13 @@ class Params:
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n" raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.") "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
n_experts = None
n_experts_used = None
if "num_local_experts" in config:
n_experts = config["num_local_experts"]
n_experts_used = config["num_experts_per_tok"]
return Params( return Params(
n_vocab = config["vocab_size"], n_vocab = config["vocab_size"],
n_embd = config["hidden_size"], n_embd = config["hidden_size"],
@ -241,6 +252,8 @@ class Params:
n_ff = config["intermediate_size"], n_ff = config["intermediate_size"],
n_head = (n_head := config["num_attention_heads"]), n_head = (n_head := config["num_attention_heads"]),
n_head_kv = config.get("num_key_value_heads", n_head), n_head_kv = config.get("num_key_value_heads", n_head),
n_experts = n_experts,
n_experts_used = n_experts_used,
f_norm_eps = config["rms_norm_eps"], f_norm_eps = config["rms_norm_eps"],
f_rope_freq_base = config.get("rope_theta"), f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type, rope_scaling_type = rope_scaling_type,
@ -255,8 +268,15 @@ class Params:
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
n_experts = None
n_experts_used = None
f_rope_freq_base = None
# hack to determine LLaMA v1 vs v2 vs CodeLlama # hack to determine LLaMA v1 vs v2 vs CodeLlama
if config.get("rope_theta") == 1000000: if config.get("moe"):
# Mixtral
n_ctx = 32768
elif config.get("rope_theta") == 1000000:
# CodeLlama # CodeLlama
n_ctx = 16384 n_ctx = 16384
elif config["norm_eps"] == 1e-05: elif config["norm_eps"] == 1e-05:
@ -266,16 +286,27 @@ class Params:
# LLaMA v1 # LLaMA v1
n_ctx = 2048 n_ctx = 2048
if "layers.0.feed_forward.w1.weight" in model:
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
if config.get("moe"):
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
n_experts = config["moe"]["num_experts"]
n_experts_used = config["moe"]["num_experts_per_tok"]
f_rope_freq_base = 1e6
return Params( return Params(
n_vocab = model["tok_embeddings.weight"].shape[0], n_vocab = model["tok_embeddings.weight"].shape[0],
n_embd = config["dim"], n_embd = config["dim"],
n_layer = config["n_layers"], n_layer = config["n_layers"],
n_ctx = n_ctx, n_ctx = n_ctx,
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0], n_ff = n_ff,
n_head = (n_head := config["n_heads"]), n_head = (n_head := config["n_heads"]),
n_head_kv = config.get("n_kv_heads", n_head), n_head_kv = config.get("n_kv_heads", n_head),
n_experts = n_experts,
n_experts_used = n_experts_used,
f_norm_eps = config["norm_eps"], f_norm_eps = config["norm_eps"],
f_rope_freq_base = config.get("rope_theta"), f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
) )
@staticmethod @staticmethod
@ -297,127 +328,138 @@ class Params:
return params return params
# class VocabLoader:
# vocab def __init__(self, params: Params, fname_tokenizer: Path) -> None:
# try:
from transformers import AutoTokenizer
except ImportError as e:
raise ImportError(
"To use VocabLoader, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
class BpeVocab: try:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) except ValueError:
added_tokens: dict[str, int] self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
if fname_added_tokens is not None:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
continue
self.added_tokens_dict[tok] = tokidx
self.unk_token_id: int = self.tokenizer.unk_token_id
self.specials: dict[str, int] = {
tok: self.tokenizer.get_vocab()[tok]
for tok in self.tokenizer.all_special_tokens
}
self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
self.vocab_size_base: int = self.tokenizer.vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
self.fname_tokenizer: Path = fname_tokenizer
vocab_file = "tokenizer.model"
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
if path_candidate is not None:
self.spm = SentencePieceProcessor(str(path_candidate))
print(self.spm.vocab_size(), self.vocab_size_base)
else: else:
# Fall back to trying to find the added tokens in tokenizer.json self.spm = None
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
if not tokenizer_json_file.is_file():
added_tokens = {}
else:
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
added_tokens = dict(
(item['content'], item['id'])
for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary.
if item['content'] not in self.bpe_tokenizer)
vocab_size: int = len(self.bpe_tokenizer) def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) tokenizer = self.tokenizer
actual_ids = sorted(added_tokens.values()) reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
if expected_ids != actual_ids: added_tokens_ids = set(self.added_tokens_dict.values())
expected_end_id = vocab_size + len(actual_ids) - 1
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) for i in range(self.vocab_size_base):
self.added_tokens_list = [text for (text, idx) in items] if i in added_tokens_ids:
self.vocab_size_base: int = vocab_size continue
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: text = reverse_vocab[i].encode("utf-8")
tokenizer = self.bpe_tokenizer yield text, self.get_token_score(i), self.get_token_type(i)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
for i, _ in enumerate(tokenizer):
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.bpe_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: dict[str, int]
if fname_added_tokens is not None:
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
added_tokens = {}
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids:
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
# Token pieces that were added to the base vocabulary.
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.id_to_piece(i)
text: bytes = piece.encode("utf-8")
score: float = tokenizer.get_score(i)
def get_token_type(self, token_id: int) -> gguf.TokenType:
toktype = gguf.TokenType.NORMAL toktype = gguf.TokenType.NORMAL
if tokenizer.is_unknown(i):
if self.spm is not None and token_id < self.spm.vocab_size():
if self.spm.is_unknown(token_id):
toktype = gguf.TokenType.UNKNOWN toktype = gguf.TokenType.UNKNOWN
if tokenizer.is_control(i): if self.spm.is_control(token_id):
toktype = gguf.TokenType.CONTROL
if self.spm.is_unused(token_id):
toktype = gguf.TokenType.UNUSED
if self.spm.is_byte(token_id):
toktype = gguf.TokenType.BYTE
else:
if token_id == self.unk_token_id:
toktype = gguf.TokenType.UNKNOWN
if token_id in self.special_ids:
toktype = gguf.TokenType.CONTROL toktype = gguf.TokenType.CONTROL
# NOTE: I think added_tokens are user defined. return toktype
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
if tokenizer.is_unused(i): def get_token_score(self, token_id: int) -> float:
toktype = gguf.TokenType.UNUSED if self.spm is not None and token_id < self.spm.vocab_size():
if tokenizer.is_byte(i): return cast(float, self.spm.get_score(token_id))
toktype = gguf.TokenType.BYTE return 0.0
yield text, score, toktype
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
for text in self.added_tokens_dict:
if text in self.specials:
toktype = self.get_token_type(self.specials[text])
score = self.get_token_score(self.specials[text])
else:
toktype = gguf.TokenType.USER_DEFINED
score = -1000.0 score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
yield text.encode("utf-8"), score, toktype
def has_newline_token(self) -> bool:
return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.sentencepiece_tokens() yield from self.hf_tokens()
yield from self.added_tokens() yield from self.added_tokens()
def get_vocab_type(self) -> str:
path_candidates = []
vocab_file = "tokenizer.model"
path_candidates.append(vocab_file)
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
if path_candidate is not None:
return "llama"
vocab_file = "vocab.json"
path_candidates.append(vocab_file)
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
if path_candidate is not None:
return "gpt2"
vocab_file = "tokenizer.json"
path_candidates.append(vocab_file)
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
if path_candidate:
if not self.has_newline_token():
return "gpt2"
return "llama"
raise FileNotFoundError(
f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; "
"if it's in another directory, pass the directory as --vocab-dir"
)
def __repr__(self) -> str: def __repr__(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>" return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' Vocab: TypeAlias = 'VocabLoader'
# #
# data loading # data loading
@ -585,7 +627,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
if any("model.embed_tokens.weight" in mp.model for mp in models_plus): if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
# Transformers models put different tensors in different files, but # Transformers models put different tensors in different files, but
# don't split indivdual tensors between files. # don't split individual tensors between files.
model: LazyModel = {} model: LazyModel = {}
for mp in models_plus: for mp in models_plus:
model.update(mp.model) model.update(mp.model)
@ -678,7 +720,7 @@ class LazyUnpickler(pickle.Unpickler):
return func(*args) return func(*args)
CLASSES: dict[tuple[str, str], Any] = { CLASSES: dict[tuple[str, str], Any] = {
# getattr used here as a workaround for mypy not being smart enough to detrmine # getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute. # the staticmethods have a __func__ attribute.
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
@ -794,20 +836,27 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
yield result yield result
def check_vocab_size(params: Params, vocab: Vocab) -> None: def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
if params.n_vocab != vocab.vocab_size: if params.n_vocab != vocab.vocab_size:
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) if params.n_vocab == vocab.vocab_size:
if params.n_vocab == vocab.vocab_size_base:
print("Ignoring added_tokens.json since model matches vocab size without it.") print("Ignoring added_tokens.json since model matches vocab size without it.")
vocab.added_tokens_list = [] vocab.added_tokens_dict = OrderedDict()
vocab.vocab_size = vocab.vocab_size_base vocab.vocab_size = vocab.vocab_size
return
if pad_vocab and params.n_vocab > vocab.vocab_size:
pad_count = params.n_vocab - vocab.vocab_size
print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
vocab.vocab_size = params.n_vocab
return return
msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
if vocab.fname_added_tokens is not None:
msg += f" combined with {vocab.fname_added_tokens}"
msg += f" has {vocab.vocab_size})." msg += f" has {vocab.vocab_size})."
if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None: if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
if vocab.vocab_size < params.n_vocab:
msg += " Possibly try using the --padvocab option."
raise Exception(msg) raise Exception(msg)
@ -832,7 +881,17 @@ class OutputFile:
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
self.gguf.add_head_count (params.n_head) self.gguf.add_head_count (params.n_head)
self.gguf.add_head_count_kv (params.n_head_kv) self.gguf.add_head_count_kv (params.n_head_kv)
if params.n_experts:
self.gguf.add_expert_count(params.n_experts)
if params.n_experts_used:
self.gguf.add_expert_used_count(params.n_experts_used)
if params.f_norm_eps:
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
else:
raise ValueError('f_norm_eps is None')
if params.f_rope_freq_base is not None: if params.f_rope_freq_base is not None:
self.gguf.add_rope_freq_base(params.f_rope_freq_base) self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@ -861,12 +920,8 @@ class OutputFile:
scores.append(score) scores.append(score)
toktypes.append(toktype) toktypes.append(toktype)
if isinstance(vocab, SentencePieceVocab): vocab_type = vocab.get_vocab_type()
self.gguf.add_tokenizer_model("llama") self.gguf.add_tokenizer_model(vocab_type)
elif isinstance(vocab, BpeVocab):
self.gguf.add_tokenizer_model("gpt2")
else:
raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
self.gguf.add_token_list(tokens) self.gguf.add_token_list(tokens)
self.gguf.add_token_scores(scores) self.gguf.add_token_scores(scores)
self.gguf.add_token_types(toktypes) self.gguf.add_token_types(toktypes)
@ -892,8 +947,12 @@ class OutputFile:
self.gguf.close() self.gguf.close()
@staticmethod @staticmethod
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: def write_vocab_only(
check_vocab_size(params, vocab) fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
of = OutputFile(fname_out, endianess=endianess) of = OutputFile(fname_out, endianess=endianess)
@ -920,8 +979,13 @@ class OutputFile:
return dt.quantize(arr) return dt.quantize(arr)
@staticmethod @staticmethod
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: def write_all(
check_vocab_size(params, vocab) fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
of = OutputFile(fname_out, endianess=endianess) of = OutputFile(fname_out, endianess=endianess)
@ -1079,35 +1143,17 @@ def load_some_model(path: Path) -> ModelPlus:
return model_plus return model_plus
def load_vocab(path: Path, vocabtype: str | None) -> Vocab: def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
# Be extra-friendly and accept either a file or a directory. Also, if it's
# a directory, it might be the model directory, and tokenizer.model might
# be in the parent of that.
if path.is_dir():
vocab_file = "tokenizer.model"
if vocabtype == 'bpe':
vocab_file = "vocab.json"
path2 = path / vocab_file path2 = path / vocab_file
# Use `.parent` instead of /.. to handle the symlink case better. # Use `.parent` instead of /.. to handle the symlink case better.
path3 = path.parent / vocab_file path3 = path.parent / vocab_file
if path2.exists(): if path2.exists():
path = path2 return path2
elif path3.exists(): if path3.exists():
path = path3 return path3
else:
raise FileNotFoundError(
f"Could not find {vocab_file} in {path} or its parent; "
"if it's in another directory, pass the directory as --vocab-dir")
print(f"Loading vocab file '{path}', type '{vocabtype}'") return None
added_tokens_path = path.parent / "added_tokens.json"
if vocabtype == "bpe":
return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
elif vocabtype == "spm":
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
@ -1145,11 +1191,11 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
args = parser.parse_args(args_in) args = parser.parse_args(args_in)
if args.dump_single: if args.dump_single:
@ -1192,12 +1238,13 @@ def main(args_in: list[str] | None = None) -> None:
if not args.outfile: if not args.outfile:
raise ValueError("need --outfile if using --vocab-only") raise ValueError("need --outfile if using --vocab-only")
# FIXME: Try to respect vocab_dir somehow? # FIXME: Try to respect vocab_dir somehow?
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) vocab = VocabLoader(params, args.vocab_dir or args.model)
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = args.vocabtype == 'bpe', load_merges = True,
n_vocab = vocab.vocab_size) n_vocab = vocab.vocab_size)
outfile = args.outfile outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess = endianess, pad_vocab = args.padvocab)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")
return return
@ -1205,12 +1252,15 @@ def main(args_in: list[str] | None = None) -> None:
vocab = model_plus.vocab vocab = model_plus.vocab
else: else:
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
vocab = load_vocab(vocab_dir, args.vocabtype) vocab = VocabLoader(params, vocab_dir)
# FIXME: Try to respect vocab_dir somehow? # FIXME: Try to respect vocab_dir somehow?
print(f"Vocab info: {vocab}")
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = args.vocabtype == 'bpe', load_merges = True,
n_vocab = vocab.vocab_size) n_vocab = vocab.vocab_size)
print(f"Special vocab info: {special_vocab}")
model = model_plus.model model = model_plus.model
model = convert_model_names(model, params) model = convert_model_names(model, params)
ftype = pick_output_type(model, args.outtype) ftype = pick_output_type(model, args.outtype)
@ -1220,7 +1270,8 @@ def main(args_in: list[str] | None = None) -> None:
params.ftype = ftype params.ftype = ftype
print(f"Writing {outfile}, format {ftype}") print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess) OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")

View file

@ -1258,9 +1258,9 @@ static struct ggml_tensor * forward_lora(
} }
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
assert(logits->n_dims == 2); assert(ggml_is_matrix(logits));
assert(probs->n_dims == 2); assert(ggml_is_matrix(probs));
assert(best_samples->n_dims == 1); assert(ggml_is_vector(best_samples));
assert(logits->ne[1] == best_samples->ne[0]); assert(logits->ne[1] == best_samples->ne[0]);
assert(logits->ne[0] == probs->ne[0]); assert(logits->ne[0] == probs->ne[0]);
assert(logits->ne[1] == probs->ne[1]); assert(logits->ne[1] == probs->ne[1]);
@ -1292,9 +1292,9 @@ static void sample_softmax_batch(
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
struct ggml_tensor * best_samples struct ggml_tensor * best_samples
) { ) {
GGML_ASSERT(best_samples->n_dims == 2); GGML_ASSERT(ggml_is_matrix(best_samples));
GGML_ASSERT(logits->n_dims == 3); GGML_ASSERT(ggml_is_3d(logits));
GGML_ASSERT(probs->n_dims == 3); GGML_ASSERT(ggml_is_3d(probs));
int n_tokens = best_samples->ne[0]; int n_tokens = best_samples->ne[0];
int n_batch = best_samples->ne[1]; int n_batch = best_samples->ne[1];
int n_vocab = logits->ne[0]; int n_vocab = logits->ne[0];
@ -1334,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
} }
static void print_matrix(struct ggml_tensor * probs) { static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2); assert(ggml_is_matrix(probs));
for (int i = 0; i < probs->ne[1]; ++i) { for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
@ -1386,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
static void get_example_targets_batch( static void get_example_targets_batch(
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
) { ) {
GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(ggml_is_matrix(tokens_input));
GGML_ASSERT( targets->n_dims == 3); GGML_ASSERT(ggml_is_3d(targets));
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
int n_batch = tokens_input->ne[1]; int n_batch = tokens_input->ne[1];
GGML_ASSERT(n_tokens == targets->ne[1]); GGML_ASSERT(n_tokens == targets->ne[1]);

View file

@ -129,13 +129,13 @@ int main(int argc, char ** argv) {
const ggml_type qtype = GGML_TYPE_Q4_1; const ggml_type qtype = GGML_TYPE_Q4_1;
size_t ctx_size = 0; size_t ctx_size = 0;
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32); ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
ctx_size += sizex*sizey*ggml_type_sizef(qtype); ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += sizex*sizey*ggml_type_sizef(qtype); ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += 1024*1024*16; ctx_size += 1024*1024*16;
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));

View file

@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
} }
static void print_matrix(struct ggml_tensor * probs) { static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2); assert(ggml_is_matrix(probs));
for (int i = 0; i < probs->ne[1]; ++i) { for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i); float p = get_f32_2d(probs, k, i);
@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct; int ct;
switch (gg_weights->n_dims){ switch (ggml_n_dims(gg_weights)) {
case 1: case 1:
ct = 0; ct = 0;
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){ for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){

View file

@ -1110,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
name = ggml_get_name(tensor); name = ggml_get_name(tensor);
} }
uint32_t name_len = strlen(name); uint32_t name_len = strlen(name);
uint32_t nd = tensor->n_dims; uint32_t nd = ggml_n_dims(tensor);
uint32_t ne[4] = { (uint32_t)tensor->ne[0], uint32_t ne[4] = { (uint32_t)tensor->ne[0],
(uint32_t)tensor->ne[1], (uint32_t)tensor->ne[1],
(uint32_t)tensor->ne[2], (uint32_t)tensor->ne[2],

View file

@ -195,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data); printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
// print first 10 elements // print first 10 elements
const float * data = (const float *) cur->data; const float * data = (const float *) cur->data;

View file

@ -514,7 +514,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ctx_size += padded_size; ctx_size += padded_size;
if (verbosity >= 3) { if (verbosity >= 3) {
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i, printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
cur->n_dims, cur->name, tensor_size, padded_size, offset); ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
} }
} }
} }
@ -739,7 +739,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
temp->ny = longer_side; temp->ny = longer_side;
temp->size = 3 * longer_side * longer_side; temp->size = 3 * longer_side * longer_side;
temp->data = new uint8_t[temp->size](); temp->data = new uint8_t[temp->size]();
uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
// fill with background color // fill with background color
for (size_t i = 0; i < temp->size; i++) { for (size_t i = 0; i < temp->size; i++) {
@ -962,7 +962,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
} }
// quantize only 2D tensors // quantize only 2D tensors
quantize &= (cur->n_dims == 2); quantize &= (ggml_n_dims(cur) == 2);
if (quantize) { if (quantize) {
new_type = type; new_type = type;
@ -1035,7 +1035,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
fout.put(0); fout.put(0);
} }
printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize, printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
} }

View file

@ -51,7 +51,7 @@ def bytes_to_unicode():
The reversible bpe codes work on unicode strings. The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a signficant percentage of your normal, say, 32K bpe vocab. This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings. To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on. And avoids mapping to whitespace/control characters the bpe code barfs on.
""" """

View file

@ -1,6 +1,6 @@
# llama.cpp/examples/lookahead # llama.cpp/examples/lookahead
Demonstartion of lookahead decoding technique: Demonstration of lookahead decoding technique:
https://lmsys.org/blog/2023-11-21-lookahead-decoding/ https://lmsys.org/blog/2023-11-21-lookahead-decoding/

View file

@ -11227,7 +11227,7 @@ class binary_reader
} }
if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array
{ {
return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimentional vector is not allowed", "size"), nullptr)); return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimensional vector is not allowed", "size"), nullptr));
} }
std::vector<size_t> dim; std::vector<size_t> dim;
if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim))) if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim)))

View file

@ -34,7 +34,8 @@ export async function* llama(prompt, params = {}, config = {}) {
headers: { headers: {
'Connection': 'keep-alive', 'Connection': 'keep-alive',
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Accept': 'text/event-stream' 'Accept': 'text/event-stream',
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
}, },
signal: controller.signal, signal: controller.signal,
}); });
@ -114,7 +115,7 @@ export async function* llama(prompt, params = {}, config = {}) {
return content; return content;
} }
// Call llama, return an event target that you can subcribe to // Call llama, return an event target that you can subscribe to
// //
// Example: // Example:
// //

View file

@ -223,7 +223,7 @@
repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_last_n: 256, // 0 = disable penalty, -1 = context size
repeat_penalty: 1.18, // 1.0 = disabled repeat_penalty: 1.18, // 1.0 = disabled
top_k: 40, // <= 0 to use vocab size top_k: 40, // <= 0 to use vocab size
top_p: 0.5, // 1.0 = disabled top_p: 0.95, // 1.0 = disabled
min_p: 0.05, // 0 = disabled min_p: 0.05, // 0 = disabled
tfs_z: 1.0, // 1.0 = disabled tfs_z: 1.0, // 1.0 = disabled
typical_p: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled
@ -235,10 +235,11 @@
grammar: '', grammar: '',
n_probs: 0, // no completion_probabilities, n_probs: 0, // no completion_probabilities,
image_data: [], image_data: [],
cache_prompt: true cache_prompt: true,
api_key: ''
}) })
/* START: Support for storing prompt templates and parameters in borwser LocalStorage */ /* START: Support for storing prompt templates and parameters in browsers LocalStorage */
const local_storage_storageKey = "llamacpp_server_local_storage"; const local_storage_storageKey = "llamacpp_server_local_storage";
@ -282,7 +283,7 @@
let importedTemplates = local_storage_getDataAsObject('user_templates') let importedTemplates = local_storage_getDataAsObject('user_templates')
if (importedTemplates) { if (importedTemplates) {
// saved templates were successfuly imported. // saved templates were successfully imported.
console.log('Processing saved templates and updating default template') console.log('Processing saved templates and updating default template')
params.value = { ...params.value, image_data: [] }; params.value = { ...params.value, image_data: [] };
@ -303,7 +304,7 @@
} }
function userTemplateResetToDefault() { function userTemplateResetToDefault() {
console.log('Reseting themplate to default') console.log('Resetting template to default')
selectedUserTemplate.value.name = 'default'; selectedUserTemplate.value.name = 'default';
selectedUserTemplate.value.data = savedUserTemplates.value['default']; selectedUserTemplate.value.data = savedUserTemplates.value['default'];
} }
@ -762,7 +763,7 @@
<fieldset class="two"> <fieldset class="two">
${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })} ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
@ -790,6 +791,10 @@
<fieldset> <fieldset>
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })} ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
</fieldset> </fieldset>
<fieldset>
<label for="api_key">API Key</label>
<input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
</fieldset>
</details> </details>
</form> </form>
` `

View file

@ -36,6 +36,7 @@ using json = nlohmann::json;
struct server_params struct server_params
{ {
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string api_key;
std::string public_path = "examples/server/public"; std::string public_path = "examples/server/public";
int32_t port = 8080; int32_t port = 8080;
int32_t read_timeout = 600; int32_t read_timeout = 600;
@ -376,7 +377,6 @@ struct llama_client_slot
int32_t num_prompt_tokens = 0; int32_t num_prompt_tokens = 0;
int32_t num_prompt_tokens_processed = 0; int32_t num_prompt_tokens_processed = 0;
int32_t multibyte_pending = 0;
json prompt; json prompt;
std::string generated_text; std::string generated_text;
@ -425,7 +425,6 @@ struct llama_client_slot
stopped_word = false; stopped_word = false;
stopped_limit = false; stopped_limit = false;
stopping_word = ""; stopping_word = "";
multibyte_pending = 0;
n_past = 0; n_past = 0;
sent_count = 0; sent_count = 0;
sent_token_probs_index = 0; sent_token_probs_index = 0;
@ -992,35 +991,36 @@ struct llama_server_context
slot.generated_text += token_str; slot.generated_text += token_str;
slot.has_next_token = true; slot.has_next_token = true;
if (slot.multibyte_pending > 0) // check if there is incomplete UTF-8 character at the end
bool incomplete = false;
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
{ {
slot.multibyte_pending -= token_str.size(); unsigned char c = slot.generated_text[slot.generated_text.size() - i];
if ((c & 0xC0) == 0x80)
{
// continuation byte: 10xxxxxx
continue;
} }
else if (token_str.size() == 1)
{
const char c = token_str[0];
// 2-byte characters: 110xxxxx 10xxxxxx
if ((c & 0xE0) == 0xC0) if ((c & 0xE0) == 0xC0)
{ {
slot.multibyte_pending = 1; // 2-byte character: 110xxxxx ...
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx incomplete = i < 2;
} }
else if ((c & 0xF0) == 0xE0) else if ((c & 0xF0) == 0xE0)
{ {
slot.multibyte_pending = 2; // 3-byte character: 1110xxxx ...
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx incomplete = i < 3;
} }
else if ((c & 0xF8) == 0xF0) else if ((c & 0xF8) == 0xF0)
{ {
slot.multibyte_pending = 3; // 4-byte character: 11110xxx ...
} incomplete = i < 4;
else
{
slot.multibyte_pending = 0;
} }
// else 1-byte character or invalid byte
break;
} }
if (slot.multibyte_pending == 0) if (!incomplete)
{ {
size_t pos = std::min(slot.sent_count, slot.generated_text.size()); size_t pos = std::min(slot.sent_count, slot.generated_text.size());
const std::string str_test = slot.generated_text.substr(pos); const std::string str_test = slot.generated_text.substr(pos);
@ -1055,7 +1055,7 @@ struct llama_server_context
} }
} }
if (slot.multibyte_pending > 0 && !slot.has_next_token) if (incomplete)
{ {
slot.has_next_token = true; slot.has_next_token = true;
} }
@ -1954,6 +1954,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
@ -2003,6 +2004,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
sparams.public_path = argv[i]; sparams.public_path = argv[i];
} }
else if (arg == "--api-key")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
sparams.api_key = argv[i];
}
else if (arg == "--timeout" || arg == "-to") else if (arg == "--timeout" || arg == "-to")
{ {
if (++i >= argc) if (++i >= argc)
@ -2382,6 +2392,7 @@ json oaicompat_completion_params_parse(
llama_params["__oaicompat"] = true; llama_params["__oaicompat"] = true;
// Map OpenAI parameters to llama.cpp parameters // Map OpenAI parameters to llama.cpp parameters
llama_params["model"] = json_value(body, "model", std::string("uknown"));
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.8); llama_params["temperature"] = json_value(body, "temperature", 0.8);
@ -2669,6 +2680,32 @@ int main(int argc, char **argv)
httplib::Server svr; httplib::Server svr;
// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation
if (sparams.api_key.empty()) {
return true;
}
// Check for API key in the header
auto auth_header = req.get_header_value("Authorization");
std::string prefix = "Bearer ";
if (auth_header.substr(0, prefix.size()) == prefix) {
std::string received_api_key = auth_header.substr(prefix.size());
if (received_api_key == sparams.api_key) {
return true; // API key is valid
}
}
// API key is invalid or not provided
res.set_content("Unauthorized: Invalid API Key", "text/plain");
res.status = 401; // Unauthorized
LOG_WARNING("Unauthorized: Invalid API Key", {});
return false;
};
svr.set_default_headers({{"Server", "llama.cpp"}, svr.set_default_headers({{"Server", "llama.cpp"},
{"Access-Control-Allow-Origin", "*"}, {"Access-Control-Allow-Origin", "*"},
{"Access-Control-Allow-Headers", "content-type"}}); {"Access-Control-Allow-Headers", "content-type"}});
@ -2711,8 +2748,11 @@ int main(int argc, char **argv)
res.set_content(data.dump(), "application/json"); res.set_content(data.dump(), "application/json");
}); });
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{ {
if (!validate_api_key(req, res)) {
return;
}
json data = json::parse(req.body); json data = json::parse(req.body);
const int task_id = llama.request_completion(data, false, false, -1); const int task_id = llama.request_completion(data, false, false, -1);
if (!json_value(data, "stream", false)) { if (!json_value(data, "stream", false)) {
@ -2799,8 +2839,11 @@ int main(int argc, char **argv)
}); });
// TODO: add mount point without "/v1" prefix -- how? // TODO: add mount point without "/v1" prefix -- how?
svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{ {
if (!validate_api_key(req, res)) {
return;
}
json data = oaicompat_completion_params_parse(json::parse(req.body)); json data = oaicompat_completion_params_parse(json::parse(req.body));
const int task_id = llama.request_completion(data, false, false, -1); const int task_id = llama.request_completion(data, false, false, -1);
@ -2869,8 +2912,11 @@ int main(int argc, char **argv)
} }
}); });
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{ {
if (!validate_api_key(req, res)) {
return;
}
json data = json::parse(req.body); json data = json::parse(req.body);
const int task_id = llama.request_completion(data, true, false, -1); const int task_id = llama.request_completion(data, true, false, -1);
if (!json_value(data, "stream", false)) { if (!json_value(data, "stream", false)) {
@ -3005,11 +3051,15 @@ int main(int argc, char **argv)
svr.set_error_handler([](const httplib::Request &, httplib::Response &res) svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
{ {
if (res.status == 401)
{
res.set_content("Unauthorized", "text/plain");
}
if (res.status == 400) if (res.status == 400)
{ {
res.set_content("Invalid request", "text/plain"); res.set_content("Invalid request", "text/plain");
} }
else if (res.status != 500) else if (res.status == 404)
{ {
res.set_content("File Not Found", "text/plain"); res.set_content("File Not Found", "text/plain");
res.status = 404; res.status = 404;
@ -3032,11 +3082,15 @@ int main(int argc, char **argv)
// to make it ctrl+clickable: // to make it ctrl+clickable:
LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
LOG_INFO("HTTP server listening", { std::unordered_map<std::string, std::string> log_data;
{"hostname", sparams.hostname}, log_data["hostname"] = sparams.hostname;
{"port", sparams.port}, log_data["port"] = std::to_string(sparams.port);
});
if (!sparams.api_key.empty()) {
log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4);
}
LOG_INFO("HTTP server listening", log_data);
// run the HTTP server in a thread - see comment below // run the HTTP server in a thread - see comment below
std::thread t([&]() std::thread t([&]()
{ {

View file

@ -1,6 +1,6 @@
# llama.cpp/examples/speculative # llama.cpp/examples/speculative
Demonstartion of speculative decoding and tree-based speculative decoding techniques Demonstration of speculative decoding and tree-based speculative decoding techniques
More info: More info:

View file

@ -428,7 +428,7 @@ int main(int argc, char ** argv) {
++n_past_tgt; ++n_past_tgt;
} }
// the first token is always proposed by the traget model before the speculation loop so we erase it here // the first token is always proposed by the target model before the speculation loop so we erase it here
for (int s = 0; s < n_seq_dft; ++s) { for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) { if (!drafts[s].active) {
continue; continue;

View file

@ -43,7 +43,7 @@ GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph
// ggml-backend v2 API // ggml-backend v2 API
// //
// Seperate tensor and graph allocator objects // Separate tensor and graph allocator objects
// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
// The original API is kept as a wrapper around the new API // The original API is kept as a wrapper around the new API

File diff suppressed because it is too large Load diff

View file

@ -66,9 +66,11 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(div_row); GGML_METAL_DECL_KERNEL(div_row);
GGML_METAL_DECL_KERNEL(scale); GGML_METAL_DECL_KERNEL(scale);
GGML_METAL_DECL_KERNEL(scale_4); GGML_METAL_DECL_KERNEL(scale_4);
GGML_METAL_DECL_KERNEL(silu); GGML_METAL_DECL_KERNEL(tanh);
GGML_METAL_DECL_KERNEL(relu); GGML_METAL_DECL_KERNEL(relu);
GGML_METAL_DECL_KERNEL(gelu); GGML_METAL_DECL_KERNEL(gelu);
GGML_METAL_DECL_KERNEL(gelu_quick);
GGML_METAL_DECL_KERNEL(silu);
GGML_METAL_DECL_KERNEL(soft_max); GGML_METAL_DECL_KERNEL(soft_max);
GGML_METAL_DECL_KERNEL(soft_max_4); GGML_METAL_DECL_KERNEL(soft_max_4);
GGML_METAL_DECL_KERNEL(diag_mask_inf); GGML_METAL_DECL_KERNEL(diag_mask_inf);
@ -86,6 +88,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(get_rows_q5_K); GGML_METAL_DECL_KERNEL(get_rows_q5_K);
GGML_METAL_DECL_KERNEL(get_rows_q6_K); GGML_METAL_DECL_KERNEL(get_rows_q6_K);
GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(group_norm);
GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mv_f32_f32); GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
GGML_METAL_DECL_KERNEL(mul_mv_f16_f16); GGML_METAL_DECL_KERNEL(mul_mv_f16_f16);
@ -102,6 +105,21 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32);
//GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16);
GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32);
//GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_1row);
//GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_l4);
GGML_METAL_DECL_KERNEL(mul_mv_id_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mv_id_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mv_id_q5_0_f32);
GGML_METAL_DECL_KERNEL(mul_mv_id_q5_1_f32);
GGML_METAL_DECL_KERNEL(mul_mv_id_q8_0_f32);
GGML_METAL_DECL_KERNEL(mul_mv_id_q2_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_id_q3_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32); GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
@ -130,8 +148,11 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(rope_f16); GGML_METAL_DECL_KERNEL(rope_f16);
GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(alibi_f32);
GGML_METAL_DECL_KERNEL(im2col_f16); GGML_METAL_DECL_KERNEL(im2col_f16);
GGML_METAL_DECL_KERNEL(upscale_f32);
GGML_METAL_DECL_KERNEL(pad_f32);
GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc); GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc);
GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc); GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc);
GGML_METAL_DECL_KERNEL(leaky_relu_f32);
GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f16);
GGML_METAL_DECL_KERNEL(cpy_f32_f32); GGML_METAL_DECL_KERNEL(cpy_f32_f32);
GGML_METAL_DECL_KERNEL(cpy_f32_q8_0); GGML_METAL_DECL_KERNEL(cpy_f32_q8_0);
@ -140,6 +161,7 @@ struct ggml_metal_context {
//GGML_METAL_DECL_KERNEL(cpy_f32_q5_0); //GGML_METAL_DECL_KERNEL(cpy_f32_q5_0);
//GGML_METAL_DECL_KERNEL(cpy_f32_q5_1); //GGML_METAL_DECL_KERNEL(cpy_f32_q5_1);
GGML_METAL_DECL_KERNEL(cpy_f16_f16); GGML_METAL_DECL_KERNEL(cpy_f16_f16);
GGML_METAL_DECL_KERNEL(cpy_f16_f32);
GGML_METAL_DECL_KERNEL(concat); GGML_METAL_DECL_KERNEL(concat);
GGML_METAL_DECL_KERNEL(sqr); GGML_METAL_DECL_KERNEL(sqr);
GGML_METAL_DECL_KERNEL(sum_rows); GGML_METAL_DECL_KERNEL(sum_rows);
@ -177,6 +199,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
} else { } else {
char* buffer2 = malloc(len+1); char* buffer2 = malloc(len+1);
va_end(args);
va_start(args, format);
vsnprintf(buffer2, len+1, format, args); vsnprintf(buffer2, len+1, format, args);
buffer2[len] = 0; buffer2[len] = 0;
ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
@ -316,9 +340,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(div_row); GGML_METAL_ADD_KERNEL(div_row);
GGML_METAL_ADD_KERNEL(scale); GGML_METAL_ADD_KERNEL(scale);
GGML_METAL_ADD_KERNEL(scale_4); GGML_METAL_ADD_KERNEL(scale_4);
GGML_METAL_ADD_KERNEL(silu); GGML_METAL_ADD_KERNEL(tanh);
GGML_METAL_ADD_KERNEL(relu); GGML_METAL_ADD_KERNEL(relu);
GGML_METAL_ADD_KERNEL(gelu); GGML_METAL_ADD_KERNEL(gelu);
GGML_METAL_ADD_KERNEL(gelu_quick);
GGML_METAL_ADD_KERNEL(silu);
GGML_METAL_ADD_KERNEL(soft_max); GGML_METAL_ADD_KERNEL(soft_max);
GGML_METAL_ADD_KERNEL(soft_max_4); GGML_METAL_ADD_KERNEL(soft_max_4);
GGML_METAL_ADD_KERNEL(diag_mask_inf); GGML_METAL_ADD_KERNEL(diag_mask_inf);
@ -336,6 +362,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(get_rows_q5_K); GGML_METAL_ADD_KERNEL(get_rows_q5_K);
GGML_METAL_ADD_KERNEL(get_rows_q6_K); GGML_METAL_ADD_KERNEL(get_rows_q6_K);
GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(group_norm);
GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mv_f32_f32); GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
GGML_METAL_ADD_KERNEL(mul_mv_f16_f16); GGML_METAL_ADD_KERNEL(mul_mv_f16_f16);
@ -352,6 +379,21 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32);
//GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16);
GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32);
//GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_1row);
//GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_l4);
GGML_METAL_ADD_KERNEL(mul_mv_id_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mv_id_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mv_id_q5_0_f32);
GGML_METAL_ADD_KERNEL(mul_mv_id_q5_1_f32);
GGML_METAL_ADD_KERNEL(mul_mv_id_q8_0_f32);
GGML_METAL_ADD_KERNEL(mul_mv_id_q2_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_id_q3_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32);
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
@ -382,8 +424,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(rope_f16); GGML_METAL_ADD_KERNEL(rope_f16);
GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(alibi_f32);
GGML_METAL_ADD_KERNEL(im2col_f16); GGML_METAL_ADD_KERNEL(im2col_f16);
GGML_METAL_ADD_KERNEL(upscale_f32);
GGML_METAL_ADD_KERNEL(pad_f32);
GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc); GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc);
GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc); GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc);
GGML_METAL_ADD_KERNEL(leaky_relu_f32);
GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f16);
GGML_METAL_ADD_KERNEL(cpy_f32_f32); GGML_METAL_ADD_KERNEL(cpy_f32_f32);
GGML_METAL_ADD_KERNEL(cpy_f32_q8_0); GGML_METAL_ADD_KERNEL(cpy_f32_q8_0);
@ -392,6 +437,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
//GGML_METAL_ADD_KERNEL(cpy_f32_q5_0); //GGML_METAL_ADD_KERNEL(cpy_f32_q5_0);
//GGML_METAL_ADD_KERNEL(cpy_f32_q5_1); //GGML_METAL_ADD_KERNEL(cpy_f32_q5_1);
GGML_METAL_ADD_KERNEL(cpy_f16_f16); GGML_METAL_ADD_KERNEL(cpy_f16_f16);
GGML_METAL_ADD_KERNEL(cpy_f16_f32);
GGML_METAL_ADD_KERNEL(concat); GGML_METAL_ADD_KERNEL(concat);
GGML_METAL_ADD_KERNEL(sqr); GGML_METAL_ADD_KERNEL(sqr);
GGML_METAL_ADD_KERNEL(sum_rows); GGML_METAL_ADD_KERNEL(sum_rows);
@ -416,9 +462,11 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(div_row); GGML_METAL_DEL_KERNEL(div_row);
GGML_METAL_DEL_KERNEL(scale); GGML_METAL_DEL_KERNEL(scale);
GGML_METAL_DEL_KERNEL(scale_4); GGML_METAL_DEL_KERNEL(scale_4);
GGML_METAL_DEL_KERNEL(silu); GGML_METAL_DEL_KERNEL(tanh);
GGML_METAL_DEL_KERNEL(relu); GGML_METAL_DEL_KERNEL(relu);
GGML_METAL_DEL_KERNEL(gelu); GGML_METAL_DEL_KERNEL(gelu);
GGML_METAL_DEL_KERNEL(gelu_quick);
GGML_METAL_DEL_KERNEL(silu);
GGML_METAL_DEL_KERNEL(soft_max); GGML_METAL_DEL_KERNEL(soft_max);
GGML_METAL_DEL_KERNEL(soft_max_4); GGML_METAL_DEL_KERNEL(soft_max_4);
GGML_METAL_DEL_KERNEL(diag_mask_inf); GGML_METAL_DEL_KERNEL(diag_mask_inf);
@ -436,6 +484,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(get_rows_q5_K); GGML_METAL_DEL_KERNEL(get_rows_q5_K);
GGML_METAL_DEL_KERNEL(get_rows_q6_K); GGML_METAL_DEL_KERNEL(get_rows_q6_K);
GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(group_norm);
GGML_METAL_DEL_KERNEL(norm); GGML_METAL_DEL_KERNEL(norm);
GGML_METAL_DEL_KERNEL(mul_mv_f32_f32); GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
GGML_METAL_DEL_KERNEL(mul_mv_f16_f16); GGML_METAL_DEL_KERNEL(mul_mv_f16_f16);
@ -452,6 +501,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32);
//GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16);
GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32);
//GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_1row);
//GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_l4);
GGML_METAL_DEL_KERNEL(mul_mv_id_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mv_id_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mv_id_q5_0_f32);
GGML_METAL_DEL_KERNEL(mul_mv_id_q5_1_f32);
GGML_METAL_DEL_KERNEL(mul_mv_id_q8_0_f32);
GGML_METAL_DEL_KERNEL(mul_mv_id_q2_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_id_q3_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32);
GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32);
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
@ -482,8 +546,11 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(rope_f16); GGML_METAL_DEL_KERNEL(rope_f16);
GGML_METAL_DEL_KERNEL(alibi_f32); GGML_METAL_DEL_KERNEL(alibi_f32);
GGML_METAL_DEL_KERNEL(im2col_f16); GGML_METAL_DEL_KERNEL(im2col_f16);
GGML_METAL_DEL_KERNEL(upscale_f32);
GGML_METAL_DEL_KERNEL(pad_f32);
GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc); GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc);
GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc); GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc);
GGML_METAL_DEL_KERNEL(leaky_relu_f32);
GGML_METAL_DEL_KERNEL(cpy_f32_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f16);
GGML_METAL_DEL_KERNEL(cpy_f32_f32); GGML_METAL_DEL_KERNEL(cpy_f32_f32);
GGML_METAL_DEL_KERNEL(cpy_f32_q8_0); GGML_METAL_DEL_KERNEL(cpy_f32_q8_0);
@ -492,6 +559,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
//GGML_METAL_DEL_KERNEL(cpy_f32_q5_0); //GGML_METAL_DEL_KERNEL(cpy_f32_q5_0);
//GGML_METAL_DEL_KERNEL(cpy_f32_q5_1); //GGML_METAL_DEL_KERNEL(cpy_f32_q5_1);
GGML_METAL_DEL_KERNEL(cpy_f16_f16); GGML_METAL_DEL_KERNEL(cpy_f16_f16);
GGML_METAL_DEL_KERNEL(cpy_f16_f32);
GGML_METAL_DEL_KERNEL(concat); GGML_METAL_DEL_KERNEL(concat);
GGML_METAL_DEL_KERNEL(sqr); GGML_METAL_DEL_KERNEL(sqr);
GGML_METAL_DEL_KERNEL(sum_rows); GGML_METAL_DEL_KERNEL(sum_rows);
@ -793,9 +861,11 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
switch (op->op) { switch (op->op) {
case GGML_OP_UNARY: case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) { switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
return true; return true;
default: default:
return false; return false;
@ -807,6 +877,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
case GGML_OP_PERMUTE: case GGML_OP_PERMUTE:
case GGML_OP_CONCAT: case GGML_OP_CONCAT:
case GGML_OP_ADD: case GGML_OP_ADD:
case GGML_OP_ACC:
case GGML_OP_MUL: case GGML_OP_MUL:
case GGML_OP_DIV: case GGML_OP_DIV:
case GGML_OP_SCALE: case GGML_OP_SCALE:
@ -814,21 +885,50 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_GROUP_NORM:
case GGML_OP_NORM: case GGML_OP_NORM:
case GGML_OP_ALIBI: case GGML_OP_ALIBI:
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_DUP: case GGML_OP_LEAKY_RELU:
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
return true; return true;
case GGML_OP_CPY:
case GGML_OP_DUP:
case GGML_OP_CONT:
{
switch (op->src[0]->type) {
case GGML_TYPE_F32:
switch (op->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
return true;
default:
return false;
}
case GGML_TYPE_F16:
switch (op->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
return true;
default:
return false;
}
default:
return false;
};
}
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
{ {
return op->ne[0] % 4 == 0; return op->ne[3] == 1;
} }
default: default:
return false; return false;
@ -904,7 +1004,10 @@ void ggml_metal_graph_compute(
} break; } break;
} }
GGML_ASSERT(ggml_metal_supports_op(dst)); if (!ggml_metal_supports_op(dst)) {
GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
GGML_ASSERT(!"unsupported op");
}
const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne00 = src0 ? src0->ne[0] : 0;
const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0;
@ -1001,34 +1104,39 @@ void ggml_metal_graph_compute(
case GGML_OP_MUL: case GGML_OP_MUL:
case GGML_OP_DIV: case GGML_OP_DIV:
{ {
GGML_ASSERT(ggml_is_contiguous(src0)); const size_t offs = 0;
GGML_ASSERT(ggml_is_contiguous(src1));
bool bcast_row = false; bool bcast_row = false;
int64_t nb = ne00; int64_t nb = ne00;
if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) { id<MTLComputePipelineState> pipeline = nil;
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
GGML_ASSERT(ggml_is_contiguous(src0));
// src1 is a row // src1 is a row
GGML_ASSERT(ne11 == 1); GGML_ASSERT(ne11 == 1);
nb = ne00 / 4; nb = ne00 / 4;
switch (dst->op) { switch (dst->op) {
case GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add_row]; break; case GGML_OP_ADD: pipeline = ctx->pipeline_add_row; break;
case GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul_row]; break; case GGML_OP_MUL: pipeline = ctx->pipeline_mul_row; break;
case GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div_row]; break; case GGML_OP_DIV: pipeline = ctx->pipeline_div_row; break;
default: GGML_ASSERT(false); default: GGML_ASSERT(false);
} }
bcast_row = true; bcast_row = true;
} else { } else {
switch (dst->op) { switch (dst->op) {
case GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add]; break; case GGML_OP_ADD: pipeline = ctx->pipeline_add; break;
case GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul]; break; case GGML_OP_MUL: pipeline = ctx->pipeline_mul; break;
case GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div]; break; case GGML_OP_DIV: pipeline = ctx->pipeline_div; break;
default: GGML_ASSERT(false); default: GGML_ASSERT(false);
} }
} }
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@ -1056,18 +1164,99 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
[encoder setBytes:&nb length:sizeof(nb) atIndex:27]; [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
[encoder setBytes:&nb length:sizeof(nb) atIndex:28];
if (bcast_row) { if (bcast_row) {
const int64_t n = ggml_nelements(dst)/4; const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} else { } else {
const int nth = MIN(1024, ne0); const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} }
} break; } break;
case GGML_OP_ACC:
{
GGML_ASSERT(src0t == GGML_TYPE_F32);
GGML_ASSERT(src1t == GGML_TYPE_F32);
GGML_ASSERT(dstt == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
const size_t pnb1 = ((int32_t *) dst->op_params)[0];
const size_t pnb2 = ((int32_t *) dst->op_params)[1];
const size_t pnb3 = ((int32_t *) dst->op_params)[2];
const size_t offs = ((int32_t *) dst->op_params)[3];
const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace) {
// run a separete kernel to cpy src->dst
// not sure how to avoid this
// TODO: make a simpler cpy_bytes kernel
const int nth = MIN(1024, ne00);
[encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
}
[encoder setComputePipelineState:ctx->pipeline_add];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
[encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
[encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
[encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20];
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21];
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23];
[encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
[encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
[encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
[encoder setBytes:&offs length:sizeof(offs) atIndex:27];
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_SCALE: case GGML_OP_SCALE:
{ {
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src0));
@ -1091,16 +1280,15 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_UNARY: case GGML_OP_UNARY:
switch (ggml_get_unary_op(gf->nodes[i])) { switch (ggml_get_unary_op(gf->nodes[i])) {
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_TANH:
{ {
[encoder setComputePipelineState:ctx->pipeline_silu]; [encoder setComputePipelineState:ctx->pipeline_tanh];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst);
GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_RELU:
{ {
@ -1121,6 +1309,28 @@ void ggml_metal_graph_compute(
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst);
GGML_ASSERT(n % 4 == 0); GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_GELU_QUICK:
{
[encoder setComputePipelineState:ctx->pipeline_gelu_quick];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_SILU:
{
[encoder setComputePipelineState:ctx->pipeline_silu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
default: default:
@ -1193,7 +1403,11 @@ void ggml_metal_graph_compute(
const float scale = ((float *) dst->op_params)[0]; const float scale = ((float *) dst->op_params)[0];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
if (id_src1) {
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
} else {
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
}
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
@ -1444,7 +1658,7 @@ void ggml_metal_graph_compute(
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
int64_t ny = (ne11 + nrows - 1)/nrows; const int64_t ny = (ne11 + nrows - 1)/nrows;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
} }
@ -1456,7 +1670,7 @@ void ggml_metal_graph_compute(
GGML_ASSERT(src0t == GGML_TYPE_I32); GGML_ASSERT(src0t == GGML_TYPE_I32);
const int n_as = ne00; const int n_as = ((int32_t *) dst->op_params)[1];
// TODO: make this more general // TODO: make this more general
GGML_ASSERT(n_as <= 8); GGML_ASSERT(n_as <= 8);
@ -1488,14 +1702,22 @@ void ggml_metal_graph_compute(
// find the break-even point where the matrix-matrix kernel becomes more efficient compared // find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel // to the matrix-vector kernel
int ne11_mm_min = 0; int ne11_mm_min = 1;
const int idx = ((int32_t *) dst->op_params)[0]; const int idx = ((int32_t *) dst->op_params)[0];
// batch size
GGML_ASSERT(ne01 == ne11);
const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && // !!!
ne11 > ne11_mm_min) { // TODO: for now, always use mat-vec kernels until we figure out how to improve the
// indirect matrix multiplication
// !!!
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) {
switch (src2->type) { switch (src2->type) {
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break; case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break;
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break;
@ -1514,19 +1736,22 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne20 length:sizeof(ne20) atIndex:3]; [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
[encoder setBytes:&ne22 length:sizeof(ne22) atIndex:4]; [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:5]; [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:5];
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:6]; [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:7];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:8];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:9];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:14];
[encoder setBytes:&idx length:sizeof(idx) atIndex:15]; [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:16];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
[encoder setBytes:&idx length:sizeof(idx) atIndex:18];
// TODO: how to make this an array? read Metal docs // TODO: how to make this an array? read Metal docs
for (int j = 0; j < n_as; ++j) { for (int j = 0; j < n_as; ++j) {
struct ggml_tensor * src_cur = dst->src[2 + j]; struct ggml_tensor * src_cur = dst->src[2 + j];
@ -1534,11 +1759,157 @@ void ggml_metal_graph_compute(
size_t offs_src_cur = 0; size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:16 + j]; [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
} }
[encoder setThreadgroupMemoryLength:8192 atIndex:0]; [encoder setThreadgroupMemoryLength:8192 atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
// TODO: processing one row at a time (ne11 -> 1) is not efficient
[encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} else {
int nth0 = 32;
int nth1 = 1;
int nrows = 1;
//printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
// use custom matrix x vector kernel
switch (src2t) {
case GGML_TYPE_F32:
{
GGML_ASSERT(src1t == GGML_TYPE_F32);
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f32_f32];
} break;
case GGML_TYPE_F16:
{
GGML_ASSERT(src1t == GGML_TYPE_F32);
nth0 = 32;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f16_f32];
} break;
case GGML_TYPE_Q4_0:
{
nth0 = 8;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_0_f32];
} break;
case GGML_TYPE_Q4_1:
{
nth0 = 8;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_1_f32];
} break;
case GGML_TYPE_Q5_0:
{
nth0 = 8;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_0_f32];
} break;
case GGML_TYPE_Q5_1:
{
nth0 = 8;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_1_f32];
} break;
case GGML_TYPE_Q8_0:
{
nth0 = 8;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q8_0_f32];
} break;
case GGML_TYPE_Q2_K:
{
nth0 = 2;
nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q2_K_f32];
} break;
case GGML_TYPE_Q3_K:
{
nth0 = 2;
nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q3_K_f32];
} break;
case GGML_TYPE_Q4_K:
{
nth0 = 4; //1;
nth1 = 8; //32;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_K_f32];
} break;
case GGML_TYPE_Q5_K:
{
nth0 = 2;
nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_K_f32];
} break;
case GGML_TYPE_Q6_K:
{
nth0 = 2;
nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32];
} break;
default:
{
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
GGML_ASSERT(false && "not implemented");
}
};
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
[encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
[encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
[encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
[encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:20];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
[encoder setBytes:&idx length:sizeof(idx) atIndex:22];
// TODO: how to make this an array? read Metal docs
for (int j = 0; j < n_as; ++j) {
struct ggml_tensor * src_cur = dst->src[2 + j];
size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
}
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src2t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src2t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif
}
else if (src2t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src2t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else {
const int64_t ny = (_ne1 + nrows - 1)/nrows;
[encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
} }
} break; } break;
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
@ -1564,11 +1935,14 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5]; [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
[encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10];
const int64_t n = ggml_nelements(src1); [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
{ {
@ -1595,6 +1969,38 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_GROUP_NORM:
{
GGML_ASSERT(ne00 % 4 == 0);
//float eps;
//memcpy(&eps, dst->op_params, sizeof(float));
const float eps = 1e-6f; // TODO: temporarily hardcoded
const int32_t n_groups = ((int32_t *) dst->op_params)[0];
int nth = 32; // SIMD width
//while (nth < ne00/4 && nth < 1024) {
// nth *= 2;
//}
[encoder setComputePipelineState:ctx->pipeline_group_norm];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:5];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:6];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
[encoder setBytes:&eps length:sizeof( float) atIndex:9];
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_NORM: case GGML_OP_NORM:
{ {
float eps; float eps;
@ -1764,6 +2170,65 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
} break; } break;
case GGML_OP_UPSCALE:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
const int sf = dst->op_params[0];
[encoder setComputePipelineState:ctx->pipeline_upscale_f32];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_PAD:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
[encoder setComputePipelineState:ctx->pipeline_pad_f32];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
{ {
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
@ -1785,6 +2250,22 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
} break; } break;
case GGML_OP_LEAKY_RELU:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
float slope;
memcpy(&slope, dst->op_params, sizeof(float));
[encoder setComputePipelineState:ctx->pipeline_leaky_relu_f32];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&slope length:sizeof(slope) atIndex:2];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:
@ -1813,7 +2294,7 @@ void ggml_metal_graph_compute(
{ {
switch (dstt) { switch (dstt) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break; case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f32]; break;
default: GGML_ASSERT(false && "not implemented"); default: GGML_ASSERT(false && "not implemented");
}; };
} break; } break;

File diff suppressed because it is too large Load diff

View file

@ -3114,7 +3114,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
size_t vl = __riscv_vsetvl_e8m1(qk/2); size_t vl = __riscv_vsetvl_e8m1(qk/2);
// These tempory registers are for masking and shift operations // These temporary registers are for masking and shift operations
vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl); vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
@ -4757,7 +4757,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
vl = 16; vl = 16;
// retreive lane to multiply with scale // retrieve lane to multiply with scale
vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl); vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl); vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl); vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);

637
ggml.c

File diff suppressed because it is too large Load diff

43
ggml.h
View file

@ -215,9 +215,9 @@
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
#define GGML_MAX_DIMS 4 #define GGML_MAX_DIMS 4
#define GGML_MAX_PARAMS 1024 #define GGML_MAX_PARAMS 2048
#define GGML_MAX_CONTEXTS 64 #define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6 #define GGML_MAX_SRC 10
#define GGML_MAX_NAME 64 #define GGML_MAX_NAME 64
#define GGML_MAX_OP_PARAMS 64 #define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_N_THREADS 4
@ -423,7 +423,9 @@ extern "C" {
GGML_OP_POOL_1D, GGML_OP_POOL_1D,
GGML_OP_POOL_2D, GGML_OP_POOL_2D,
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
GGML_OP_LEAKY_RELU,
GGML_OP_FLASH_ATTN, GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_FF, GGML_OP_FLASH_FF,
@ -463,7 +465,6 @@ extern "C" {
GGML_UNARY_OP_GELU, GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_QUICK, GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU, GGML_UNARY_OP_SILU,
GGML_UNARY_OP_LEAKY,
GGML_UNARY_OP_COUNT, GGML_UNARY_OP_COUNT,
}; };
@ -501,7 +502,6 @@ extern "C" {
struct ggml_backend_buffer * buffer; struct ggml_backend_buffer * buffer;
int n_dims;
int64_t ne[GGML_MAX_DIMS]; // number of elements int64_t ne[GGML_MAX_DIMS]; // number of elements
size_t nb[GGML_MAX_DIMS]; // stride in bytes: size_t nb[GGML_MAX_DIMS]; // stride in bytes:
// nb[0] = ggml_type_size(type) // nb[0] = ggml_type_size(type)
@ -533,7 +533,7 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu void * extra; // extra things e.g. for ggml-cuda.cu
char padding[12]; char padding[8];
}; };
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -638,11 +638,14 @@ extern "C" {
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
GGML_API int ggml_blck_size(enum ggml_type type); GGML_API int ggml_blck_size(enum ggml_type type);
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_DEPRECATED(
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
"use ggml_row_size() instead");
GGML_API const char * ggml_type_name(enum ggml_type type); GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op); GGML_API const char * ggml_op_name (enum ggml_op op);
@ -661,6 +664,11 @@ extern "C" {
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@ -793,6 +801,9 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// dst = a
// view(dst, nb1, nb2, nb3, offset) += b
// return dst
GGML_API struct ggml_tensor * ggml_acc( GGML_API struct ggml_tensor * ggml_acc(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -957,15 +968,14 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_leaky( GGML_API struct ggml_tensor * ggml_leaky_relu(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a, float negative_slope, bool inplace);
GGML_API struct ggml_tensor * ggml_relu_inplace( GGML_API struct ggml_tensor * ggml_relu_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// TODO: double-check this computation is correct
GGML_API struct ggml_tensor * ggml_gelu( GGML_API struct ggml_tensor * ggml_gelu(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
@ -1051,7 +1061,8 @@ extern "C" {
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b) // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
GGML_API struct ggml_tensor * ggml_mul_mat_id( GGML_API struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * as[], struct ggml_tensor * const as[],
int n_as,
struct ggml_tensor * ids, struct ggml_tensor * ids,
int id, int id,
struct ggml_tensor * b); struct ggml_tensor * b);
@ -1263,6 +1274,7 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// supports 3D: a->ne[2] == b->ne[1]
GGML_API struct ggml_tensor * ggml_get_rows( GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1549,6 +1561,15 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
int scale_factor); int scale_factor);
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
GGML_API struct ggml_tensor * ggml_pad(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1,
int p2,
int p3);
// sort rows // sort rows
enum ggml_sort_order { enum ggml_sort_order {
GGML_SORT_ASC, GGML_SORT_ASC,

View file

@ -61,7 +61,7 @@ If you want to publish the package manually for any reason, you need to have `tw
pip install build twine pip install build twine
``` ```
Then, folow these steps to release a new version: Then, follow these steps to release a new version:
1. Bump the version in `pyproject.toml`. 1. Bump the version in `pyproject.toml`.
2. Build the package: 2. Build the package:

View file

@ -38,6 +38,8 @@ class Keys:
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
EXPERT_COUNT = "{arch}.expert_count"
EXPERT_USED_COUNT = "{arch}.expert_used_count"
class Attention: class Attention:
HEAD_COUNT = "{arch}.attention.head_count" HEAD_COUNT = "{arch}.attention.head_count"
@ -111,10 +113,14 @@ class MODEL_TENSOR(IntEnum):
ATTN_NORM = auto() ATTN_NORM = auto()
ATTN_NORM_2 = auto() ATTN_NORM_2 = auto()
ATTN_ROT_EMBD = auto() ATTN_ROT_EMBD = auto()
FFN_GATE_INP = auto()
FFN_NORM = auto()
FFN_GATE = auto() FFN_GATE = auto()
FFN_DOWN = auto() FFN_DOWN = auto()
FFN_UP = auto() FFN_UP = auto()
FFN_NORM = auto() FFN_GATE_EXP = auto()
FFN_DOWN_EXP = auto()
FFN_UP_EXP = auto()
ATTN_Q_NORM = auto() ATTN_Q_NORM = auto()
ATTN_K_NORM = auto() ATTN_K_NORM = auto()
@ -154,10 +160,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
} }
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -172,10 +182,14 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
], ],
MODEL_ARCH.GPTNEOX: [ MODEL_ARCH.GPTNEOX: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,

View file

@ -339,6 +339,12 @@ class GGUFWriter:
def add_clamp_kqv(self, value: float) -> None: def add_clamp_kqv(self, value: float) -> None:
self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
def add_expert_count(self, count: int) -> None:
self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
def add_expert_used_count(self, count: int) -> None:
self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
def add_layer_norm_eps(self, value: float) -> None: def add_layer_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

View file

@ -149,6 +149,11 @@ class TensorNameMap:
"model.layers.{bid}.ln2", # yi "model.layers.{bid}.ln2", # yi
), ),
MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
),
# Feed-forward up # Feed-forward up
MODEL_TENSOR.FFN_UP: ( MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
@ -164,6 +169,11 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.w1", # qwen "transformer.h.{bid}.mlp.w1", # qwen
), ),
MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
),
# Feed-forward gate # Feed-forward gate
MODEL_TENSOR.FFN_GATE: ( MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
@ -171,6 +181,11 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.w2", # qwen "transformer.h.{bid}.mlp.w2", # qwen
), ),
MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
),
# Feed-forward down # Feed-forward down
MODEL_TENSOR.FFN_DOWN: ( MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
@ -185,6 +200,11 @@ class TensorNameMap:
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
), ),
MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
),
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
), ),
@ -213,10 +233,13 @@ class TensorNameMap:
for tensor, keys in self.block_mappings_cfg.items(): for tensor, keys in self.block_mappings_cfg.items():
if tensor not in MODEL_TENSORS[arch]: if tensor not in MODEL_TENSORS[arch]:
continue continue
tensor_name = TENSOR_NAMES[tensor].format(bid = bid) # TODO: make this configurable
n_experts = 8
for xid in range(n_experts):
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
self.mapping[tensor_name] = (tensor, tensor_name) self.mapping[tensor_name] = (tensor, tensor_name)
for key in keys: for key in keys:
key = key.format(bid = bid) key = key.format(bid = bid, xid = xid)
self.mapping[key] = (tensor, tensor_name) self.mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "gguf" name = "gguf"
version = "0.6.0" version = "0.7.0"
description = "Read and write ML models in GGUF for GGML" description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"] authors = ["GGML <ggml@ggml.ai>"]
packages = [ packages = [

200
llama.cpp
View file

@ -92,6 +92,7 @@
#endif #endif
#define LLAMA_MAX_NODES 8192 #define LLAMA_MAX_NODES 8192
#define LLAMA_MAX_EXPERTS 8
// //
// logging // logging
@ -231,6 +232,8 @@ enum llm_kv {
LLM_KV_FEED_FORWARD_LENGTH, LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL, LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_EXPERT_COUNT,
LLM_KV_EXPERT_USED_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV, LLM_KV_ATTENTION_HEAD_COUNT_KV,
@ -281,6 +284,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@ -338,10 +343,14 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2, LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_ROT_EMBD, LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP, LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM, LLM_TENSOR_FFN_DOWN_EXP,
LLM_TENSOR_FFN_GATE_EXP,
LLM_TENSOR_FFN_UP_EXP,
LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_ATTN_K_NORM,
}; };
@ -360,10 +369,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
}, },
}, },
{ {
@ -585,6 +598,10 @@ struct LLM_TN {
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix; return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
} }
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
}
}; };
// //
@ -1164,6 +1181,8 @@ struct llama_hparams {
uint32_t n_layer; uint32_t n_layer;
uint32_t n_rot; uint32_t n_rot;
uint32_t n_ff; uint32_t n_ff;
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
float f_norm_eps; float f_norm_eps;
float f_norm_rms_eps; float f_norm_rms_eps;
@ -1187,6 +1206,9 @@ struct llama_hparams {
if (this->n_layer != other.n_layer) return true; if (this->n_layer != other.n_layer) return true;
if (this->n_rot != other.n_rot) return true; if (this->n_rot != other.n_rot) return true;
if (this->n_ff != other.n_ff) return true; if (this->n_ff != other.n_ff) return true;
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
if (this->rope_finetuned != other.rope_finetuned) return true; if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
@ -1268,6 +1290,12 @@ struct llama_layer {
struct ggml_tensor * ffn_down; // w2 struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * ffn_up; // w3 struct ggml_tensor * ffn_up; // w3
// ff MoE
struct ggml_tensor * ffn_gate_inp;
struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
// ff bias // ff bias
struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3 struct ggml_tensor * ffn_up_b; // b3
@ -1527,7 +1555,7 @@ static bool llama_kv_cache_init(
cache.cells.clear(); cache.cells.clear();
cache.cells.resize(n_ctx); cache.cells.resize(n_ctx);
cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead()); cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
memset(cache.buf.data, 0, cache.buf.size); memset(cache.buf.data, 0, cache.buf.size);
struct ggml_init_params params; struct ggml_init_params params;
@ -2440,6 +2468,16 @@ static void llm_load_hparams(
ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
if (hparams.n_expert > 0) {
GGML_ASSERT(hparams.n_expert_used > 0);
} else {
GGML_ASSERT(hparams.n_expert_used == 0);
}
// n_head_kv is optional, default to n_head // n_head_kv is optional, default to n_head
hparams.n_head_kv = hparams.n_head; hparams.n_head_kv = hparams.n_head;
@ -2758,7 +2796,7 @@ static void llm_load_vocab(
// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
// are special tokens. // are special tokens.
// From testing, this appears to corelate 1:1 with special tokens. // From testing, this appears to correlate 1:1 with special tokens.
// //
// Counting special tokens and verifying in only one direction // Counting special tokens and verifying in only one direction
@ -2871,6 +2909,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@ -3025,9 +3065,26 @@ static void llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
if (layer.ffn_gate_inp == nullptr) {
GGML_ASSERT(hparams.n_expert == 0);
GGML_ASSERT(hparams.n_expert_used == 0);
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
} else {
GGML_ASSERT(hparams.n_expert > 0);
GGML_ASSERT(hparams.n_expert_used > 0);
// MoE branch
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
}
}
if (backend == GGML_BACKEND_GPU) { if (backend == GGML_BACKEND_GPU) {
vram_weights += vram_weights +=
@ -3037,8 +3094,18 @@ static void llm_load_tensors(
(layer.bk ? ggml_nbytes(layer.bk) : 0) + (layer.bk ? ggml_nbytes(layer.bk) : 0) +
(layer.bv ? ggml_nbytes(layer.bv) : 0) + (layer.bv ? ggml_nbytes(layer.bv) : 0) +
(layer.bo ? ggml_nbytes(layer.bo) : 0) + (layer.bo ? ggml_nbytes(layer.bo) : 0) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_norm);
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
if (layer.ffn_gate_inp == nullptr) {
vram_weights +=
ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
} else {
vram_weights += ggml_nbytes(layer.ffn_gate_inp);
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
vram_weights +=
ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
}
}
} }
} }
} break; } break;
@ -3755,8 +3822,8 @@ static void llm_build_k_shift(
ggml_rope_custom_inplace(ctx, ggml_rope_custom_inplace(ctx,
ggml_view_3d(ctx, kv.k_l[il], ggml_view_3d(ctx, kv.k_l[il],
n_embd_head, n_head_kv, n_ctx, n_embd_head, n_head_kv, n_ctx,
ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, ggml_row_size(kv.k_l[il]->type, n_embd_head),
ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
0), 0),
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
@ -3785,7 +3852,7 @@ static void llm_build_kv_store(
cb(v_cur_t, "v_cur_t", il); cb(v_cur_t, "v_cur_t", il);
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa, struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
(ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head); (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
cb(k_cache_view, "k_cache_view", il); cb(k_cache_view, "k_cache_view", il);
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa, struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
@ -3944,8 +4011,8 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * k = struct ggml_tensor * k =
ggml_view_3d(ctx, kv.k_l[il], ggml_view_3d(ctx, kv.k_l[il],
n_embd_head, n_kv, n_head_kv, n_embd_head, n_kv, n_head_kv,
ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, ggml_row_size(kv.k_l[il]->type, n_embd_head),
0); 0);
cb(k, "k", il); cb(k, "k", il);
@ -4019,6 +4086,8 @@ struct llm_build_context {
const int64_t n_head_kv; const int64_t n_head_kv;
const int64_t n_embd_head; const int64_t n_embd_head;
const int64_t n_embd_gqa; const int64_t n_embd_gqa;
const int64_t n_expert;
const int64_t n_expert_used;
const float freq_base; const float freq_base;
const float freq_scale; const float freq_scale;
@ -4060,6 +4129,8 @@ struct llm_build_context {
n_head_kv (hparams.n_head_kv), n_head_kv (hparams.n_head_kv),
n_embd_head (hparams.n_embd_head()), n_embd_head (hparams.n_embd_head()),
n_embd_gqa (hparams.n_embd_gqa()), n_embd_gqa (hparams.n_embd_gqa()),
n_expert (hparams.n_expert),
n_expert_used (hparams.n_expert_used),
freq_base (cparams.rope_freq_base), freq_base (cparams.rope_freq_base),
freq_scale (cparams.rope_freq_scale), freq_scale (cparams.rope_freq_scale),
ext_factor (cparams.yarn_ext_factor), ext_factor (cparams.yarn_ext_factor),
@ -4184,7 +4255,7 @@ struct llm_build_context {
cb(ffn_inp, "ffn_inp", il); cb(ffn_inp, "ffn_inp", il);
// feed-forward network // feed-forward network
{ if (model.layers[il].ffn_gate_inp == nullptr) {
cur = llm_build_norm(ctx0, ffn_inp, hparams, cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL, model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il); LLM_NORM_RMS, cb, il);
@ -4196,6 +4267,69 @@ struct llm_build_context {
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
} else {
// MoE branch
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
cb(logits, "ffn_moe_logits", il);
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
cb(probs, "ffn_moe_probs", il);
// select experts
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
cb(selected_experts->src[0], "ffn_moe_argsort", il);
ggml_tensor * weights = ggml_get_rows(ctx0,
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
cb(weights, "ffn_moe_weights", il);
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
cb(weights_sum, "ffn_moe_weights_sum", il);
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
cb(weights, "ffn_moe_weights_norm", il);
// compute expert outputs
ggml_tensor * moe_out = nullptr;
for (int i = 0; i < n_expert_used; ++i) {
ggml_tensor * cur_expert;
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
cb(cur_up, "ffn_moe_up", il);
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
cb(cur_gate, "ffn_moe_gate", il);
cur_gate = ggml_silu(ctx0, cur_gate);
cb(cur_gate, "ffn_moe_silu", il);
cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
cb(cur_expert, "ffn_moe_gate_par", il);
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
cb(cur_expert, "ffn_moe_down", il);
cur_expert = ggml_mul(ctx0, cur_expert,
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
cb(cur_expert, "ffn_moe_weighted", il);
if (i == 0) {
moe_out = cur_expert;
} else {
moe_out = ggml_add(ctx0, moe_out, cur_expert);
cb(moe_out, "ffn_moe_out", il);
}
}
cur = moe_out;
} }
cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, ffn_inp);
@ -5450,6 +5584,20 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
{ "ffn_relu", OFFLOAD_FUNC }, { "ffn_relu", OFFLOAD_FUNC },
{ "ffn_sqr(relu)", OFFLOAD_FUNC }, { "ffn_sqr(relu)", OFFLOAD_FUNC },
{ "ffn_moe_logits", OFFLOAD_FUNC },
{ "ffn_moe_probs", OFFLOAD_FUNC },
{ "ffn_moe_argsort", OFFLOAD_FUNC },
{ "ffn_moe_weights", OFFLOAD_FUNC },
{ "ffn_moe_weights_sum", OFFLOAD_FUNC },
{ "ffn_moe_weights_norm", OFFLOAD_FUNC },
{ "ffn_moe_weighted", OFFLOAD_FUNC },
{ "ffn_moe_up", OFFLOAD_FUNC },
{ "ffn_moe_gate", OFFLOAD_FUNC },
{ "ffn_moe_silu", OFFLOAD_FUNC },
{ "ffn_moe_gate_par", OFFLOAD_FUNC },
{ "ffn_moe_down", OFFLOAD_FUNC },
{ "ffn_moe_out", OFFLOAD_FUNC },
{ "l_out", OFFLOAD_FUNC }, { "l_out", OFFLOAD_FUNC },
{ "result_norm", OFFLOAD_FUNC_EMB }, { "result_norm", OFFLOAD_FUNC_EMB },
@ -5846,7 +5994,7 @@ static int llama_decode_internal(
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
const int64_t n_vocab = hparams.n_vocab; const int64_t n_vocab = hparams.n_vocab;
// helpers for smoother batch API transistion // helpers for smoother batch API transition
// after deprecating the llama_eval calls, these will be removed // after deprecating the llama_eval calls, these will be removed
std::vector<llama_pos> pos; std::vector<llama_pos> pos;
@ -6625,12 +6773,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
// loop over the text // loop over the text
while (true) { while (true) {
// find the first occurence of a given special token in this fragment // find the first occurrence of a given special token in this fragment
// passing offset argument only limit the "search area" but match coordinates // passing offset argument only limit the "search area" but match coordinates
// are still relative to the source full raw_text // are still relative to the source full raw_text
auto match = raw_text->find(special_token, raw_text_base_offset); auto match = raw_text->find(special_token, raw_text_base_offset);
// no occurences found, stop processing this fragment for a given special token // no occurrences found, stop processing this fragment for a given special token
if (match == std::string::npos) break; if (match == std::string::npos) break;
// check if match is within bounds of offset <-> length // check if match is within bounds of offset <-> length
@ -7829,7 +7977,7 @@ struct llama_beam_search_data {
} }
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams). // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
// The repetative patterns below reflect the 2 stages of heaps: // The repetitive patterns below reflect the 2 stages of heaps:
// * Gather elements until the vector is full, then call std::make_heap() on it. // * Gather elements until the vector is full, then call std::make_heap() on it.
// * If the heap is full and a new element is found that should be included, pop the // * If the heap is full and a new element is found that should be included, pop the
// least element to the back(), replace it with the new, then push it into the heap. // least element to the back(), replace it with the new, then push it into the heap.
@ -8067,11 +8215,9 @@ static void llama_convert_tensor_internal(
workers.clear(); workers.clear();
} }
static ggml_type get_k_quant_type( static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
quantize_state_internal & qs,
ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
) {
const std::string name = ggml_get_name(tensor); const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants // TODO: avoid hardcoded tensor names - use the TN_* constants
const llm_arch arch = qs.model.arch; const llm_arch arch = qs.model.arch;
const auto tn = LLM_TN(arch); const auto tn = LLM_TN(arch);
@ -8105,7 +8251,18 @@ static ggml_type get_k_quant_type(
// nearly negligible increase in model size by quantizing this tensor with more bits: // nearly negligible increase in model size by quantizing this tensor with more bits:
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
} }
if (qs.model.hparams.n_expert == 8) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
++qs.i_attention_wv; ++qs.i_attention_wv;
} else if (name.find("attn_k.weight") != std::string::npos) {
if (qs.model.hparams.n_expert == 8) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
} else if (name.find("ffn_down.weight") != std::string::npos) { } else if (name.find("ffn_down.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@ -8314,10 +8471,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
// quantize only 2D tensors // quantize only 2D tensors
quantize &= (tensor->n_dims == 2); quantize &= (ggml_n_dims(tensor) == 2);
quantize &= params->quantize_output_tensor || name != "output.weight"; quantize &= params->quantize_output_tensor || name != "output.weight";
quantize &= !params->only_copy; quantize &= !params->only_copy;
// do not quantize expert gating tensors
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
enum ggml_type new_type; enum ggml_type new_type;
void * new_data; void * new_data;
size_t new_size; size_t new_size;

View file

@ -216,7 +216,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
bool logits_all; // the llama_eval() call computes all logits, not just the last one bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embedding; // embedding mode only bool embedding; // embedding mode only
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
}; };

View file

@ -1,3 +1,5 @@
numpy==1.24.4 numpy==1.24.4
sentencepiece==0.1.98 sentencepiece==0.1.98
transformers>=4.34.0
gguf>=0.1.0 gguf>=0.1.0
protobuf>=4.21.0

38
scripts/get-flags.mk Normal file
View file

@ -0,0 +1,38 @@
ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
GF_CC_IS_GCC = 1
GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
else
GF_CC_IS_CLANG = 1
ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
GF_CC_IS_LLVM_CLANG = 1
else
GF_CC_IS_APPLE_CLANG = 1
endif
GF_CC_VER := \
$(shell $(GF_CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
endif
ifeq ($(GF_CC_IS_CLANG), 1)
# clang options
GF_CFLAGS = -Wunreachable-code-break -Wunreachable-code-return
GF_CXXFLAGS = -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
ifneq '' '$(and $(GF_CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 030800)))'
GF_CFLAGS += -Wdouble-promotion
endif
ifneq '' '$(and $(GF_CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 070300)))'
GF_CFLAGS += -Wdouble-promotion
endif
else
# gcc options
GF_CFLAGS = -Wdouble-promotion
GF_CXXFLAGS = -Wno-array-bounds
ifeq ($(shell expr $(GF_CC_VER) \>= 070100), 1)
GF_CXXFLAGS += -Wno-format-truncation
endif
ifeq ($(shell expr $(GF_CC_VER) \>= 080100), 1)
GF_CXXFLAGS += -Wextra-semi
endif
endif

View file

@ -20,8 +20,6 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
size_t size = ggml_nelements(tensor); size_t size = ggml_nelements(tensor);
std::vector<float> data(size); std::vector<float> data(size);
std::random_device rd;
#if 0 #if 0
std::default_random_engine generator(rd()); std::default_random_engine generator(rd());
std::uniform_real_distribution<float> distribution(min, max); std::uniform_real_distribution<float> distribution(min, max);
@ -31,6 +29,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
} }
#endif #endif
auto init_thread = [&](size_t start, size_t end) { auto init_thread = [&](size_t start, size_t end) {
std::random_device rd;
std::default_random_engine generator(rd()); std::default_random_engine generator(rd());
std::uniform_real_distribution<float> distribution(min, max); std::uniform_real_distribution<float> distribution(min, max);
@ -51,11 +50,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
t.join(); t.join();
} }
if (tensor->type == GGML_TYPE_F32) { if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) { } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
std::vector<uint8_t> dataq(ggml_type_size(tensor->type)*size/ggml_blck_size(tensor->type)); std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
int64_t hist[16]; int64_t hist[16];
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist); ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
@ -71,23 +70,29 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
std::vector<uint8_t> buf(ggml_nbytes(t)); std::vector<uint8_t> buf(ggml_nbytes(t));
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t)); ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
size_t bs = ggml_blck_size(t->type);
std::vector<float> vq(ggml_blck_size(t->type));
bool quantized = ggml_is_quantized(t->type);
// access elements by index to avoid gaps in views // access elements by index to avoid gaps in views
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) { for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) { for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) { for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
for (int64_t i0 = 0; i0 < t->ne[0]; i0++) { for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
float v;
if (t->type == GGML_TYPE_F16) { if (t->type == GGML_TYPE_F16) {
v = (float) ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]); tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
} else if (t->type == GGML_TYPE_F32) { } else if (t->type == GGML_TYPE_F32) {
v = *(float *) &buf[i]; tv.push_back(*(float *) &buf[i]);
} else if (t->type == GGML_TYPE_I32) { } else if (t->type == GGML_TYPE_I32) {
v = *(int32_t *) &buf[i]; tv.push_back((float)*(int32_t *) &buf[i]);
} else if (quantized) {
tt.to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end());
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
tv.push_back(v);
} }
} }
} }
@ -230,9 +235,18 @@ static bool ggml_is_view_op(enum ggml_op op) {
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE; return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
} }
enum test_mode {
MODE_TEST,
MODE_PERF,
};
struct test_case { struct test_case {
virtual ~test_case() {} virtual ~test_case() {}
virtual std::string op_desc(ggml_tensor * t) {
return ggml_op_desc(t);
}
virtual std::string vars() { virtual std::string vars() {
return ""; return "";
} }
@ -240,7 +254,7 @@ struct test_case {
virtual ggml_tensor * build_graph(ggml_context * ctx) = 0; virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
virtual double max_nmse_err() { virtual double max_nmse_err() {
return 1e-6; return 1e-7;
} }
virtual void initialize_tensors(ggml_context * ctx) { virtual void initialize_tensors(ggml_context * ctx) {
@ -260,7 +274,58 @@ struct test_case {
return size; return size;
} }
ggml_cgraph * gf = nullptr;
static const int sentinel_size = 1024;
test_mode mode;
std::vector<ggml_tensor *> sentinels;
void add_sentinel(ggml_context * ctx) {
if (mode == MODE_PERF) {
return;
}
ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
ggml_format_name(sentinel, "sent_%zu", sentinels.size());
sentinels.push_back(sentinel);
}
// hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
add_sentinel(ctx);
return t;
}
ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
add_sentinel(ctx);
return t;
}
ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
add_sentinel(ctx);
return t;
}
ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
add_sentinel(ctx);
return t;
}
ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
add_sentinel(ctx);
return t;
}
bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) { bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
mode = MODE_TEST;
ggml_init_params params = { ggml_init_params params = {
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
/* .mem_base = */ NULL, /* .mem_base = */ NULL,
@ -268,15 +333,20 @@ struct test_case {
}; };
ggml_context * ctx = ggml_init(params); ggml_context * ctx = ggml_init(params);
gf = ggml_new_graph(ctx);
// pre-graph sentinel
add_sentinel(ctx);
ggml_tensor * out = build_graph(ctx); ggml_tensor * out = build_graph(ctx);
if (op_name != nullptr && strcmp(ggml_op_desc(out), op_name) != 0) { if (op_name != nullptr && op_desc(out) != op_name) {
//printf(" %s: skipping\n", ggml_op_desc(out)); //printf(" %s: skipping\n", op_desc(out).c_str());
ggml_free(ctx); ggml_free(ctx);
return true; return true;
} }
printf(" %s(%s): ", ggml_op_desc(out), vars().c_str()); printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
fflush(stdout); fflush(stdout);
// check if backends support op // check if backends support op
@ -288,13 +358,20 @@ struct test_case {
} }
} }
// post-graph sentinel
add_sentinel(ctx);
// allocate // allocate
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1); ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
// build graph // build graph
ggml_cgraph * gf = ggml_new_graph(ctx);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);
// add sentinels as graph nodes so that they are checked in the callback
for (ggml_tensor * sentinel : sentinels) {
gf->nodes[gf->n_nodes++] = sentinel;
}
// randomize tensors // randomize tensors
initialize_tensors(ctx); initialize_tensors(ctx);
@ -310,14 +387,29 @@ struct test_case {
}; };
auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool { auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
callback_userdata * ud = (callback_userdata *) user_data;
if (t1->op == GGML_OP_NONE) {
// sentinels must be unchanged
std::vector<uint8_t> t1_data(ggml_nbytes(t1));
std::vector<uint8_t> t2_data(ggml_nbytes(t2));
ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
printf("sentinel mismatch: %s ", t1->name);
ud->ok = false;
return true;
}
}
std::vector<float> f1 = tensor_to_float(t1); std::vector<float> f1 = tensor_to_float(t1);
std::vector<float> f2 = tensor_to_float(t2); std::vector<float> f2 = tensor_to_float(t2);
callback_userdata * ud = (callback_userdata *) user_data;
for (size_t i = 0; i < f1.size(); i++) { for (size_t i = 0; i < f1.size(); i++) {
// check for nans // check for nans
if (std::isnan(f1[i]) || std::isnan(f2[i])) { if (std::isnan(f1[i]) || std::isnan(f2[i])) {
printf("NaN at index %zu ", i); printf("[%s] NaN at index %zu (%f %f) ", ggml_op_desc(t1), i, f1[i], f2[i]);
ud->ok = false; ud->ok = false;
return true; return true;
} }
@ -325,12 +417,12 @@ struct test_case {
if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) { if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) { if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
if (std::signbit(f1[i]) != std::signbit(f2[i])) { if (std::signbit(f1[i]) != std::signbit(f2[i])) {
printf("inf sign mismatch: %f %f ", f1[i], f2[i]); printf("[%s] inf sign mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]);
ud->ok = false; ud->ok = false;
return true; return true;
} }
} else { } else {
printf("inf mismatch: %f %f ", f1[i], f2[i]); printf("[%s] inf mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]);
ud->ok = false; ud->ok = false;
return true; return true;
} }
@ -339,10 +431,17 @@ struct test_case {
double err = nmse(f1.data(), f2.data(), f1.size()); double err = nmse(f1.data(), f2.data(), f1.size());
if (err > ud->max_err) { if (err > ud->max_err) {
printf("NMSE = %f ", err); printf("[%s] NMSE = %f ", ggml_op_desc(t1), err);
//for (int i = 0; i < f1.size(); i++) {
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
//}
//printf("\n");
//exit(1);
ud->ok = false; ud->ok = false;
} }
return true; return true;
GGML_UNUSED(index);
}; };
ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
@ -361,6 +460,8 @@ struct test_case {
} }
bool eval_perf(ggml_backend_t backend, const char * op_name) { bool eval_perf(ggml_backend_t backend, const char * op_name) {
mode = MODE_PERF;
static const size_t graph_nodes = 8192; static const size_t graph_nodes = 8192;
ggml_init_params params = { ggml_init_params params = {
@ -372,13 +473,13 @@ struct test_case {
ggml_tensor * out = build_graph(ctx); ggml_tensor * out = build_graph(ctx);
if (op_name != nullptr && strcmp(ggml_op_desc(out), op_name) != 0) { if (op_name != nullptr && op_desc(out) != op_name) {
//printf(" %s: skipping\n", ggml_op_desc(out)); //printf(" %s: skipping\n", op_desc(out).c_str());
ggml_free(ctx); ggml_free(ctx);
return true; return true;
} }
int len = printf(" %s(%s): ", ggml_op_desc(out), vars().c_str()); int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
fflush(stdout); fflush(stdout);
// check if backends support op // check if backends support op
@ -430,8 +531,9 @@ struct test_case {
return size; return size;
}; };
for (int i = 0; i < gf->n_nodes; i++) { for (int i = 0; i < gf->n_nodes; i++) {
if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
continue; continue;
}
mem += tensor_op_size(gf->nodes[i]); mem += tensor_op_size(gf->nodes[i]);
} }
@ -486,17 +588,22 @@ struct test_get_rows : public test_case {
const int n; // cols const int n; // cols
const int m; // rows const int m; // rows
const int r; // rows to get const int r; // rows to get
const int b; // batch size
const bool v; // view (non-contiguous src1)
std::string vars() override { std::string vars() override {
return VARS_TO_STR4(type, n, m, r); return VARS_TO_STR6(type, n, m, r, b, v);
} }
test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3) test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
: type(type), n(n), m(m), r(r) {} : type(type), n(n), m(m), r(r), b(b), v(v) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * in = ggml_new_tensor_2d(ctx, type, n, m); ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
ggml_tensor * rows = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, r); ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
if (v) {
rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
}
ggml_tensor * out = ggml_get_rows(ctx, in, rows); ggml_tensor * out = ggml_get_rows(ctx, in, rows);
return out; return out;
} }
@ -504,12 +611,13 @@ struct test_get_rows : public test_case {
void initialize_tensors(ggml_context * ctx) override { void initialize_tensors(ggml_context * ctx) override {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->type == GGML_TYPE_I32) { if (t->type == GGML_TYPE_I32) {
if (ggml_is_view_op(t->op)) { continue; }
// rows // rows
std::vector<int> data(r); std::vector<int> data(r*b);
for (int i = 0; i < r; i++) { for (int i = 0; i < r*b; i++) {
data[i] = rand() % m; data[i] = rand() % m;
} }
ggml_backend_tensor_set(t, data.data(), 0, r * sizeof(int)); ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
} else { } else {
init_tensor_uniform(t); init_tensor_uniform(t);
} }
@ -770,11 +878,10 @@ struct test_mul_mat_id : public test_case {
const int64_t m; const int64_t m;
const int64_t n; const int64_t n;
const int64_t k; const int64_t k;
const std::array<int64_t, 2> bs; // dims 3 and 4 const bool v; // view (non-contiguous ids)
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
std::string vars() override { std::string vars() override {
return VARS_TO_STR9(type_a, type_b, n_mats, id, m, n, k, bs, nr); return VARS_TO_STR8(type_a, type_b, n_mats, id, m, n, k, v);
} }
double max_nmse_err() override { double max_nmse_err() override {
@ -782,7 +889,7 @@ struct test_mul_mat_id : public test_case {
} }
size_t op_size(ggml_tensor * t) override { size_t op_size(ggml_tensor * t) override {
size_t a = ggml_nbytes(t->src[2]) * n * nr[0] * nr[1]; size_t a = ggml_nbytes(t->src[2]) * n;
size_t b = ggml_nbytes(t->src[1]) * m; size_t b = ggml_nbytes(t->src[1]) * m;
size_t c = ggml_nbytes(t); size_t c = ggml_nbytes(t);
return a + b + c; return a + b + c;
@ -792,35 +899,41 @@ struct test_mul_mat_id : public test_case {
test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
int n_mats = 2, int id = 0, int n_mats = 2, int id = 0,
int64_t m = 32, int64_t n = 32, int64_t k = 32, int64_t m = 32, int64_t n = 32, int64_t k = 32, bool v = false)
std::array<int64_t, 2> bs = {10, 10},
std::array<int64_t, 2> nr = {2, 2})
: type_a(type_a), type_b(type_b), n_mats(n_mats), id(id), : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id),
m(m), n(n), k(k), bs(bs), nr(nr) {} m(m), n(n), k(k), v(v) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
// C^T = A * B^T: (k, m) * (k, n) => (m, n) // C^T = A * B^T: (k, m) * (k, n) => (m, n)
std::vector<ggml_tensor *> mats; std::vector<ggml_tensor *> mats;
for (int i = 0; i < n_mats; i++) { for (int i = 0; i < n_mats; i++) {
ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]); ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
mats.push_back(a); mats.push_back(a);
} }
ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_mats); ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); if (v) {
ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), ids, id, b); ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
}
ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b);
return out; return out;
} }
void initialize_tensors(ggml_context * ctx) override { void initialize_tensors(ggml_context * ctx) override {
std::random_device rd;
std::default_random_engine rng(rd());
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->type == GGML_TYPE_I32) { if (t->type == GGML_TYPE_I32) {
if (ggml_is_view_op(t->op)) { continue; }
// ids // ids
std::vector<int> data(n_mats); for (int64_t r = 0; r < ggml_nrows(t); r++) {
for (int i = 0; i < n_mats; i++) { std::vector<int32_t> data(t->ne[0]);
data[i] = i; for (int i = 0; i < t->ne[0]; i++) {
data[i] = i % n_mats;
}
std::shuffle(data.begin(), data.end(), rng);
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
} }
std::shuffle(data.begin(), data.end(), std::default_random_engine(std::random_device()()));
ggml_backend_tensor_set(t, data.data(), 0, n_mats * sizeof(int));
} else { } else {
init_tensor_uniform(t); init_tensor_uniform(t);
} }
@ -1109,22 +1222,227 @@ struct test_sum_rows : public test_case {
} }
}; };
enum test_mode { // GGML_OP_UPSCALE
MODE_TEST, struct test_upscale : public test_case {
MODE_PERF, const ggml_type type;
const std::array<int64_t, 4> ne;
const int32_t scale_factor;
std::string vars() override {
return VARS_TO_STR3(type, ne, scale_factor);
}
test_upscale(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {512, 512, 3, 1},
int32_t scale_factor = 2)
: type(type), ne(ne), scale_factor(scale_factor) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
return out;
}
};
// GGML_OP_GROUP_NORM
struct test_group_norm : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
const int32_t num_groups;
std::string vars() override {
return VARS_TO_STR3(type, ne, num_groups);
}
test_group_norm(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {64, 64, 320, 1},
int32_t num_groups = 32)
: type(type), ne(ne), num_groups(num_groups) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
return out;
}
};
// GGML_OP_ACC
struct test_acc : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne_a;
const std::array<int64_t, 4> ne_b;
std::string vars() override {
return VARS_TO_STR3(type, ne_a, ne_b);
}
test_acc(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
: type(type), ne_a(ne_a), ne_b(ne_b) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
return out;
}
};
// GGML_OP_PAD
struct test_pad : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne_a;
const int pad_0;
const int pad_1;
std::string vars() override {
return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
}
test_pad(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
int pad_0 = 1, int pad_1 = 1)
: type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
return out;
}
};
// GGML_OP_LEAKY_RELU
struct test_leaky_relu : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne_a;
const float negative_slope;
std::string vars() override {
return VARS_TO_STR3(type, ne_a, negative_slope);
}
test_leaky_relu(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
float negative_slope = 0.1f)
: type(type), ne_a(ne_a), negative_slope(negative_slope) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
return out;
}
};
// Mixtral MOE
struct test_moe : public test_case {
const int n_experts;
const int n_experts_per_tok;
const int n_tokens;
const int n_embd;
const int n_ff;
std::string op_desc(ggml_tensor * t) override {
return "MOE";
GGML_UNUSED(t);
}
std::string vars() override {
return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
}
test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
: n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);
std::vector<ggml_tensor *> ffn_up_exp(n_experts);
std::vector<ggml_tensor *> ffn_gate_exp(n_experts);
std::vector<ggml_tensor *> ffn_down_exp(n_experts);
for (int i = 0; i < n_experts; ++i) {
ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
}
ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd));
// select experts
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
ggml_tensor * weights = ggml_get_rows(ctx,
ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);
weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights);
weights = ggml_div(ctx, weights, weights_sum);
// compute expert outputs
ggml_tensor * moe_out = nullptr;
for (int i = 0; i < n_experts_per_tok; ++i) {
ggml_tensor * cur_expert;
ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur);
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur);
cur_gate = ggml_silu(ctx, cur_gate);
cur_expert = ggml_mul(ctx, cur_up, cur_gate);
cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert);
cur_expert = ggml_mul(ctx, cur_expert,
ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
if (i == 0) {
moe_out = cur_expert;
} else {
moe_out = ggml_add(ctx, moe_out, cur_expert);
}
}
cur = moe_out;
return cur;
}
}; };
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) { static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
std::vector<std::unique_ptr<test_case>> test_cases; std::vector<std::unique_ptr<test_case>> test_cases;
const ggml_type all_types[] = {
GGML_TYPE_F32, GGML_TYPE_F16,
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
GGML_TYPE_Q8_0,
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K
};
// unary ops // unary ops
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
test_cases.emplace_back(new test_unary((ggml_unary_op) op)); test_cases.emplace_back(new test_unary((ggml_unary_op) op));
} }
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
test_cases.emplace_back(new test_get_rows(type, 10, 5, 3)); for (ggml_type type : all_types) {
test_cases.emplace_back(new test_get_rows(type, 16, 5, 3)); for (int b : {1, 7}) {
for (bool v : {false, true}) {
test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
}
}
} }
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1})); test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
@ -1134,7 +1452,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2})); test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
test_cases.emplace_back(new test_dup()); test_cases.emplace_back(new test_dup());
test_cases.emplace_back(new test_cpy());
for (ggml_type type : all_types) {
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
}
test_cases.emplace_back(new test_cont()); test_cases.emplace_back(new test_cont());
auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) { auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
@ -1144,6 +1466,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}; };
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
@ -1170,8 +1493,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1}); //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1}); //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
test_cases.emplace_back(new test_scale()); test_cases.emplace_back(new test_scale());
@ -1180,16 +1503,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps)); test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
} }
const ggml_type all_types[] = {
GGML_TYPE_F32, GGML_TYPE_F16,
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
GGML_TYPE_Q8_0,
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K
};
for (ggml_type type_a : all_types) { for (ggml_type type_a : all_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
// FIXME: CPU crashes on f16xf16 // FIXME: CPU crashes on f16xf16
@ -1213,9 +1526,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
for (ggml_type type_a : all_types) { for (ggml_type type_a : all_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
for (int n_mats : {1, 2, 4}) { for (int n_mats : {2, 4, 8}) {
for (int id = 0; id < n_mats; id++) { for (int id = 0; id < n_mats; id++) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, {1, 1}, {1, 1})); for (bool v : {false, true}) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, v));
}
} }
} }
} }
@ -1247,10 +1562,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_concat()); test_cases.emplace_back(new test_concat());
for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) { for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
} }
test_cases.emplace_back(new test_sum_rows()); test_cases.emplace_back(new test_sum_rows());
test_cases.emplace_back(new test_upscale());
test_cases.emplace_back(new test_group_norm());
test_cases.emplace_back(new test_acc());
test_cases.emplace_back(new test_pad());
test_cases.emplace_back(new test_leaky_relu());
#if !defined(__SANITIZE_THREAD__)
// FIXME: these tests use too much memory with thread sanitizer
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
#endif
// run tests // run tests
if (mode == MODE_TEST) { if (mode == MODE_TEST) {
@ -1267,14 +1594,17 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
ggml_backend_free(backend_cpu); ggml_backend_free(backend_cpu);
return n_ok == test_cases.size(); return n_ok == test_cases.size();
} else if (mode == MODE_PERF) { }
if (mode == MODE_PERF) {
for (auto & test : test_cases) { for (auto & test : test_cases) {
test->eval_perf(backend, op_name); test->eval_perf(backend, op_name);
} }
return true; return true;
} else {
GGML_ASSERT(false);
} }
GGML_ASSERT(false);
return false;
} }
static void usage(char ** argv) { static void usage(char ** argv) {
@ -1347,11 +1677,12 @@ int main(int argc, char ** argv) {
} }
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count()); printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
if (n_ok != ggml_backend_reg_get_count()) { if (n_ok != ggml_backend_reg_get_count()) {
printf("\033[1;31mFAIL\033[0m\n"); printf("\033[1;31mFAIL\033[0m\n");
return 1; return 1;
} else { }
printf("\033[1;32mOK\033[0m\n"); printf("\033[1;32mOK\033[0m\n");
return 0; return 0;
} }
}

View file

@ -1,4 +1,4 @@
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#include "ggml.h" #include "ggml.h"
#include <cmath> #include <cmath>

View file

@ -117,7 +117,7 @@ static void usage(char * argv[]) {
printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE); printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE); printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE); printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
printf(" --op OP set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n"); printf(" --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
printf(" quantize_row_q_dot, vec_dot_q (all)\n"); printf(" quantize_row_q_dot, vec_dot_q (all)\n");
printf(" --type TYPE set test type as"); printf(" --type TYPE set test type as");
for (int i = 0; i < GGML_TYPE_COUNT; i++) { for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@ -202,7 +202,7 @@ int main(int argc, char * argv[]) {
} }
int alignment = std::stoi(argv[i]); int alignment = std::stoi(argv[i]);
if (alignment < 0 || alignment > MAX_ALIGNMENT) { if (alignment < 0 || alignment > MAX_ALIGNMENT) {
fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT); fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
invalid_param = true; invalid_param = true;
break; break;
} }
@ -286,7 +286,7 @@ int main(int argc, char * argv[]) {
qfns.from_float_reference(test_data1, test_q1, size); qfns.from_float_reference(test_data1, test_q1, size);
return test_q1[0]; return test_q1[0];
}; };
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); size_t quantized_size = ggml_row_size(type, size);
benchmark_function(size, quantized_size, iterations, quantize_fn); benchmark_function(size, quantized_size, iterations, quantize_fn);
} }
printf("\n"); printf("\n");
@ -300,7 +300,7 @@ int main(int argc, char * argv[]) {
qfns.from_float(test_data1, test_q1, size); qfns.from_float(test_data1, test_q1, size);
return test_q1[0]; return test_q1[0];
}; };
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); size_t quantized_size = ggml_row_size(type, size);
benchmark_function(size, quantized_size, iterations, quantize_fn); benchmark_function(size, quantized_size, iterations, quantize_fn);
} }
printf("\n"); printf("\n");
@ -315,7 +315,7 @@ int main(int argc, char * argv[]) {
qfns.to_float(test_q1, test_out, size); qfns.to_float(test_q1, test_out, size);
return test_out[0]; return test_out[0];
}; };
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); size_t quantized_size = ggml_row_size(type, size);
benchmark_function(size, quantized_size, iterations, quantize_fn); benchmark_function(size, quantized_size, iterations, quantize_fn);
} }
printf("\n"); printf("\n");
@ -330,7 +330,7 @@ int main(int argc, char * argv[]) {
vdot.from_float(test_data1, test_q1, size); vdot.from_float(test_data1, test_q1, size);
return test_q1[0]; return test_q1[0];
}; };
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); size_t quantized_size = ggml_row_size(type, size);
benchmark_function(size, quantized_size, iterations, quantize_fn); benchmark_function(size, quantized_size, iterations, quantize_fn);
} }
printf("\n"); printf("\n");
@ -347,7 +347,7 @@ int main(int argc, char * argv[]) {
qfns.vec_dot(size, &result, test_q1, test_q2); qfns.vec_dot(size, &result, test_q1, test_q2);
return result; return result;
}; };
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); size_t quantized_size = ggml_row_size(type, size);
benchmark_function(size, quantized_size, iterations, quantize_fn); benchmark_function(size, quantized_size, iterations, quantize_fn);
} }
printf("\n"); printf("\n");