Merge remote-tracking branch 'origin/master' into vulkan
This commit is contained in:
commit
5fef0d6bc9
44 changed files with 4618 additions and 1090 deletions
|
@ -15,6 +15,9 @@ indent_size = 4
|
|||
[Makefile]
|
||||
indent_style = tab
|
||||
|
||||
[scripts/*.mk]
|
||||
indent_style = tab
|
||||
|
||||
[prompts/*.txt]
|
||||
insert_final_newline = unset
|
||||
|
||||
|
|
133
CMakeLists.txt
133
CMakeLists.txt
|
@ -414,57 +414,102 @@ if (LLAMA_HIPBLAS)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_ALL_WARNINGS)
|
||||
if (NOT MSVC)
|
||||
set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
|
||||
set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
|
||||
set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
|
||||
set(host_cxx_flags "")
|
||||
function(get_flags CCID CCVER)
|
||||
set(C_FLAGS "")
|
||||
set(CXX_FLAGS "")
|
||||
|
||||
if (CMAKE_C_COMPILER_ID MATCHES "Clang")
|
||||
set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
|
||||
if (CCID MATCHES "Clang")
|
||||
set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
|
||||
set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
|
||||
|
||||
if (
|
||||
(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
|
||||
(CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
|
||||
)
|
||||
set(c_flags ${c_flags} -Wdouble-promotion)
|
||||
endif()
|
||||
elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
|
||||
set(c_flags ${c_flags} -Wdouble-promotion)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
|
||||
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
|
||||
endif()
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
|
||||
endif()
|
||||
if (
|
||||
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
|
||||
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
|
||||
)
|
||||
set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
|
||||
endif()
|
||||
elseif (CCID STREQUAL "GNU")
|
||||
set(C_FLAGS -Wdouble-promotion)
|
||||
set(CXX_FLAGS -Wno-array-bounds)
|
||||
|
||||
if (CCVER VERSION_GREATER_EQUAL 7.1.0)
|
||||
set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
|
||||
endif()
|
||||
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
|
||||
set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
|
||||
endif()
|
||||
else()
|
||||
# todo : msvc
|
||||
endif()
|
||||
|
||||
set(c_flags ${c_flags} ${warning_flags})
|
||||
set(cxx_flags ${cxx_flags} ${warning_flags})
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
|
||||
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
|
||||
set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
if (LLAMA_ALL_WARNINGS)
|
||||
if (NOT MSVC)
|
||||
set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
|
||||
set(C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
|
||||
-Werror=implicit-int -Werror=implicit-function-declaration)
|
||||
set(CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
|
||||
|
||||
set(C_FLAGS ${WARNING_FLAGS} ${C_FLAGS})
|
||||
set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
|
||||
|
||||
get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
|
||||
else()
|
||||
# todo : msvc
|
||||
set(C_FLAGS "")
|
||||
set(CXX_FLAGS "")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
set(cuda_flags -Wno-pedantic)
|
||||
endif()
|
||||
set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
|
||||
if (LLAMA_CUBLAS)
|
||||
set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
|
||||
if (NOT MSVC)
|
||||
set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
|
||||
endif()
|
||||
|
||||
list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
|
||||
if (NOT cuda_host_flags STREQUAL "")
|
||||
set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
|
||||
endif()
|
||||
if (LLAMA_ALL_WARNINGS AND NOT MSVC)
|
||||
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
|
||||
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
|
||||
set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
|
||||
endif()
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
|
||||
execute_process(
|
||||
COMMAND ${NVCC_CMD} -Xcompiler --version
|
||||
OUTPUT_VARIABLE CUDA_CCFULLVER
|
||||
ERROR_QUIET
|
||||
)
|
||||
|
||||
if (NOT CUDA_CCFULLVER MATCHES clang)
|
||||
set(CUDA_CCID "GNU")
|
||||
execute_process(
|
||||
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
|
||||
OUTPUT_VARIABLE CUDA_CCVER
|
||||
ERROR_QUIET
|
||||
)
|
||||
else()
|
||||
if (CUDA_CCFULLVER MATCHES Apple)
|
||||
set(CUDA_CCID "AppleClang")
|
||||
else()
|
||||
set(CUDA_CCID "Clang")
|
||||
endif()
|
||||
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
|
||||
endif()
|
||||
|
||||
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
||||
|
||||
get_flags(${CUDA_CCID} ${CUDA_CCVER})
|
||||
list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument
|
||||
if (NOT CUDA_CXX_FLAGS STREQUAL "")
|
||||
set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||
|
@ -488,6 +533,7 @@ endif()
|
|||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
|
||||
ERROR_VARIABLE output
|
||||
OUTPUT_QUIET
|
||||
)
|
||||
if (output MATCHES "dyld-1015\.7")
|
||||
add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
|
||||
|
@ -610,6 +656,11 @@ else()
|
|||
message(STATUS "Unknown architecture")
|
||||
endif()
|
||||
|
||||
if (MINGW)
|
||||
# Target Windows 8 for PrefetchVirtualMemory
|
||||
add_compile_definitions(_WIN32_WINNT=0x602)
|
||||
endif()
|
||||
|
||||
#
|
||||
# POSIX conformance
|
||||
#
|
||||
|
|
140
Makefile
140
Makefile
|
@ -26,20 +26,6 @@ ifndef UNAME_M
|
|||
UNAME_M := $(shell uname -m)
|
||||
endif
|
||||
|
||||
ifeq '' '$(findstring clang,$(shell $(CC) --version))'
|
||||
CC_IS_GCC=1
|
||||
CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
|
||||
else
|
||||
CC_IS_CLANG=1
|
||||
ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
|
||||
CC_IS_LLVM_CLANG=1
|
||||
else
|
||||
CC_IS_APPLE_CLANG=1
|
||||
endif
|
||||
CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
|
||||
| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
|
||||
endif
|
||||
|
||||
# Mac OS + Arm can report x86_64
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
|
@ -121,12 +107,12 @@ MK_CXXFLAGS = -std=c++11 -fPIC
|
|||
|
||||
# -Ofast tends to produce faster code, but may not be available for some compilers.
|
||||
ifdef LLAMA_FAST
|
||||
MK_CFLAGS += -Ofast
|
||||
MK_HOST_CXXFLAGS += -Ofast
|
||||
MK_CUDA_CXXFLAGS += -O3
|
||||
MK_CFLAGS += -Ofast
|
||||
HOST_CXXFLAGS += -Ofast
|
||||
MK_NVCCFLAGS += -O3
|
||||
else
|
||||
MK_CFLAGS += -O3
|
||||
MK_CXXFLAGS += -O3
|
||||
MK_CFLAGS += -O3
|
||||
MK_CXXFLAGS += -O3
|
||||
endif
|
||||
|
||||
# clock_gettime came in POSIX.1b (1993)
|
||||
|
@ -220,30 +206,6 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
|
|||
-Werror=implicit-function-declaration
|
||||
MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
|
||||
|
||||
ifeq ($(CC_IS_CLANG), 1)
|
||||
# clang options
|
||||
MK_CFLAGS += -Wunreachable-code-break -Wunreachable-code-return
|
||||
MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
|
||||
|
||||
ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
|
||||
MK_CFLAGS += -Wdouble-promotion
|
||||
endif
|
||||
ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
|
||||
MK_CFLAGS += -Wdouble-promotion
|
||||
endif
|
||||
else
|
||||
# gcc options
|
||||
MK_CFLAGS += -Wdouble-promotion
|
||||
MK_HOST_CXXFLAGS += -Wno-array-bounds
|
||||
|
||||
ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
|
||||
MK_HOST_CXXFLAGS += -Wno-format-truncation
|
||||
endif
|
||||
ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
|
||||
MK_HOST_CXXFLAGS += -Wextra-semi
|
||||
endif
|
||||
endif
|
||||
|
||||
# this version of Apple ld64 is buggy
|
||||
ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
|
||||
MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
|
||||
|
@ -294,8 +256,8 @@ ifndef RISCV
|
|||
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
||||
# Use all CPU extensions that are available:
|
||||
MK_CFLAGS += -march=native -mtune=native
|
||||
MK_HOST_CXXFLAGS += -march=native -mtune=native
|
||||
MK_CFLAGS += -march=native -mtune=native
|
||||
HOST_CXXFLAGS += -march=native -mtune=native
|
||||
|
||||
# Usage AVX-only
|
||||
#MK_CFLAGS += -mfma -mf16c -mavx
|
||||
|
@ -306,12 +268,15 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
|||
#MK_CXXFLAGS += -mssse3
|
||||
endif
|
||||
|
||||
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
|
||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
|
||||
# https://github.com/ggerganov/llama.cpp/issues/2922
|
||||
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
|
||||
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
|
||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
|
||||
# https://github.com/ggerganov/llama.cpp/issues/2922
|
||||
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
|
||||
# Target Windows 8 for PrefetchVirtualMemory
|
||||
MK_CPPFLAGS += -D_WIN32_WINNT=0x602
|
||||
endif
|
||||
|
||||
ifneq ($(filter aarch64%,$(UNAME_M)),)
|
||||
|
@ -395,61 +360,64 @@ ifdef LLAMA_CUBLAS
|
|||
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
OBJS += ggml-cuda.o
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
|
||||
MK_NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
MK_NVCCFLAGS += -lineinfo
|
||||
endif
|
||||
|
||||
ifdef LLAMA_CUDA_NVCC
|
||||
NVCC = $(LLAMA_CUDA_NVCC)
|
||||
else
|
||||
NVCC = nvcc
|
||||
endif #LLAMA_CUDA_NVCC
|
||||
ifdef CUDA_DOCKER_ARCH
|
||||
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
|
||||
else ifdef CUDA_POWER_ARCH
|
||||
NVCCFLAGS +=
|
||||
else
|
||||
NVCCFLAGS += -arch=native
|
||||
MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
|
||||
else ifndef CUDA_POWER_ARCH
|
||||
MK_NVCCFLAGS += -arch=native
|
||||
endif # CUDA_DOCKER_ARCH
|
||||
ifdef LLAMA_CUDA_FORCE_DMMV
|
||||
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||
endif # LLAMA_CUDA_FORCE_DMMV
|
||||
ifdef LLAMA_CUDA_FORCE_MMQ
|
||||
NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
|
||||
endif # LLAMA_CUDA_FORCE_MMQ
|
||||
ifdef LLAMA_CUDA_DMMV_X
|
||||
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||
else
|
||||
NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
|
||||
endif # LLAMA_CUDA_DMMV_X
|
||||
ifdef LLAMA_CUDA_MMV_Y
|
||||
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
|
||||
else ifdef LLAMA_CUDA_DMMV_Y
|
||||
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
|
||||
else
|
||||
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
|
||||
endif # LLAMA_CUDA_MMV_Y
|
||||
ifdef LLAMA_CUDA_F16
|
||||
NVCCFLAGS += -DGGML_CUDA_F16
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_F16
|
||||
endif # LLAMA_CUDA_F16
|
||||
ifdef LLAMA_CUDA_DMMV_F16
|
||||
NVCCFLAGS += -DGGML_CUDA_F16
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_F16
|
||||
endif # LLAMA_CUDA_DMMV_F16
|
||||
ifdef LLAMA_CUDA_KQUANTS_ITER
|
||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
||||
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
||||
else
|
||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
||||
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
||||
endif
|
||||
ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
||||
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
|
||||
else
|
||||
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
|
||||
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
|
||||
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
||||
#ifdef LLAMA_CUDA_CUBLAS
|
||||
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
|
||||
# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
|
||||
#endif # LLAMA_CUDA_CUBLAS
|
||||
ifdef LLAMA_CUDA_CCBIN
|
||||
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
||||
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
||||
endif
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(NVCC) $(NVCCFLAGS) -c $< -o $@
|
||||
$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
||||
endif # LLAMA_CUBLAS
|
||||
|
||||
ifdef LLAMA_CLBLAST
|
||||
|
@ -526,16 +494,22 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
|||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_MPI
|
||||
|
||||
# combine build flags with cmdline overrides
|
||||
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
|
||||
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
|
||||
override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
|
||||
override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
|
||||
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
||||
GF_CC := $(CC)
|
||||
include scripts/get-flags.mk
|
||||
|
||||
# save CXXFLAGS before we add host-only options
|
||||
NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
|
||||
override CXXFLAGS += $(HOST_CXXFLAGS)
|
||||
# combine build flags with cmdline overrides
|
||||
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
|
||||
BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
|
||||
override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
|
||||
override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
|
||||
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
||||
|
||||
# identify CUDA host compiler
|
||||
ifdef LLAMA_CUBLAS
|
||||
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
|
||||
include scripts/get-flags.mk
|
||||
CUDA_CXXFLAGS := $(GF_CXXFLAGS)
|
||||
endif
|
||||
|
||||
#
|
||||
# Print build information
|
||||
|
@ -745,16 +719,16 @@ tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
|
|||
tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
|
||||
|
|
12
README.md
12
README.md
|
@ -10,6 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
|||
|
||||
### Hot topics
|
||||
|
||||
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
|
||||
- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
|
||||
- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
|
||||
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
|
||||
|
@ -96,7 +97,18 @@ as the main playground for developing new features for the [ggml](https://github
|
|||
- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
|
||||
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
|
||||
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
|
||||
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
|
||||
- [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586)
|
||||
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
|
||||
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
||||
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
|
||||
|
||||
**Multimodal models:**
|
||||
|
||||
- [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
|
||||
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
|
||||
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
|
||||
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
|
||||
|
||||
|
||||
**Bindings:**
|
||||
|
|
|
@ -656,6 +656,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
} else if (arg == "-h" || arg == "--help") {
|
||||
return false;
|
||||
|
||||
} else if (arg == "--version") {
|
||||
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
exit(0);
|
||||
} else if (arg == "--random-prompt") {
|
||||
params.random_prompt = true;
|
||||
} else if (arg == "--in-prefix-bos") {
|
||||
|
@ -794,6 +798,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" --version show version and build info\n");
|
||||
printf(" -i, --interactive run in interactive mode\n");
|
||||
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
||||
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||
|
|
|
@ -61,13 +61,13 @@
|
|||
// #define LOG_TARGET stderr
|
||||
// #include "log.h"
|
||||
//
|
||||
// The log target can also be redirected to a diffrent function
|
||||
// The log target can also be redirected to a different function
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET log_handler_diffrent()
|
||||
// #define LOG_TARGET log_handler_different()
|
||||
// #include "log.h"
|
||||
//
|
||||
// FILE* log_handler_diffrent()
|
||||
// FILE* log_handler_different()
|
||||
// {
|
||||
// return stderr;
|
||||
// }
|
||||
|
@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
|
|||
|
||||
// Disables logs entirely at runtime.
|
||||
// Makes LOG() and LOG_TEE() produce no output,
|
||||
// untill enabled back.
|
||||
// until enabled back.
|
||||
#define log_disable() log_disable_impl()
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
|
|
|
@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
|
|||
|
||||
struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
|
||||
float scale = 1.0f; // xavier
|
||||
switch (tensor->n_dims) {
|
||||
switch (ggml_n_dims(tensor)) {
|
||||
case 1:
|
||||
scale /= sqrtf((float) tensor->ne[0]);
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
|
@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
|
|||
}
|
||||
|
||||
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
|
||||
switch (tensor->n_dims) {
|
||||
switch (ggml_n_dims(tensor)) {
|
||||
case 1:
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
|
||||
|
@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) {
|
|||
}
|
||||
|
||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||
GGML_ASSERT(tensor->n_dims == 1);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == 1);
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
}
|
||||
|
||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||
GGML_ASSERT(tensor->n_dims == 2);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
}
|
||||
|
||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||
GGML_ASSERT(tensor->n_dims == 3);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
}
|
||||
|
||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
GGML_ASSERT(tensor->n_dims == 4);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||
|
@ -225,8 +227,8 @@ int64_t get_example_targets_batch(
|
|||
bool sample_random_offsets
|
||||
) {
|
||||
GGML_ASSERT(samples_count > 0);
|
||||
GGML_ASSERT(tokens_input->n_dims == 2);
|
||||
GGML_ASSERT(target_probs->n_dims == 3);
|
||||
GGML_ASSERT(ggml_is_matrix(tokens_input));
|
||||
GGML_ASSERT(ggml_is_3d(target_probs));
|
||||
int64_t n_vocab = target_probs->ne[0];
|
||||
int64_t n_tokens = tokens_input->ne[0];
|
||||
int64_t n_batch = tokens_input->ne[1];
|
||||
|
|
|
@ -77,8 +77,18 @@ class Model:
|
|||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
if (n_ff := self.hparams.get("intermediate_size")) is not None:
|
||||
self.gguf_writer.add_feed_forward_length(n_ff)
|
||||
if (n_head := self.hparams.get("num_attention_head")) is not None:
|
||||
if (n_head := self.hparams.get("num_attention_heads")) is not None:
|
||||
self.gguf_writer.add_head_count(n_head)
|
||||
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
|
||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||
|
||||
if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
|
||||
self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
|
||||
if (n_experts := self.hparams.get("num_local_experts")) is not None:
|
||||
self.gguf_writer.add_expert_count(n_experts)
|
||||
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||
|
||||
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
||||
|
||||
def write_tensors(self):
|
||||
|
@ -170,6 +180,8 @@ class Model:
|
|||
return StableLMModel
|
||||
if model_architecture == "QWenLMHeadModel":
|
||||
return QwenModel
|
||||
if model_architecture == "MixtralForCausalLM":
|
||||
return MixtralModel
|
||||
return Model
|
||||
|
||||
def _is_model_safetensors(self) -> bool:
|
||||
|
@ -207,6 +219,8 @@ class Model:
|
|||
return gguf.MODEL_ARCH.STABLELM
|
||||
if arch == "QWenLMHeadModel":
|
||||
return gguf.MODEL_ARCH.QWEN
|
||||
if arch == "MixtralForCausalLM":
|
||||
return gguf.MODEL_ARCH.LLAMA
|
||||
|
||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||
|
||||
|
@ -837,6 +851,11 @@ class StableLMModel(Model):
|
|||
self.gguf_writer.add_layer_norm_eps(1e-5)
|
||||
|
||||
|
||||
class MixtralModel(Model):
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
|
||||
class QwenModel(Model):
|
||||
@staticmethod
|
||||
def token_bytes_to_string(b):
|
||||
|
|
|
@ -3,7 +3,6 @@ from __future__ import annotations
|
|||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Sequence
|
||||
|
@ -11,43 +10,15 @@ from typing import Any, BinaryIO, Sequence
|
|||
import numpy as np
|
||||
import torch
|
||||
|
||||
from pathlib import Path
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
||||
|
||||
|
||||
HF_SUBLAYER_TO_GGML = {
|
||||
"self_attn.q_proj": "attn_q",
|
||||
"self_attn.k_proj": "attn_k",
|
||||
"self_attn.v_proj": "attn_v",
|
||||
"self_attn.o_proj": "attn_output",
|
||||
"mlp.gate_proj": "ffn_gate",
|
||||
"mlp.down_proj": "ffn_down",
|
||||
"mlp.up_proj": "ffn_up",
|
||||
"input_layernorm": "attn_norm",
|
||||
"post_attention_layernorm": "ffn_norm",
|
||||
}
|
||||
|
||||
|
||||
def translate_tensor_name(t: str) -> str:
|
||||
match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
|
||||
if match:
|
||||
nn = match.group(1)
|
||||
sub_layer = match.group(2)
|
||||
lora_type = match.group(3)
|
||||
|
||||
sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
|
||||
if sub_layer_renamed is None:
|
||||
print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
|
||||
sys.exit(1)
|
||||
|
||||
output_string = (
|
||||
f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
|
||||
)
|
||||
return output_string
|
||||
else:
|
||||
print(f"Error: unrecognized tensor {t}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
|
||||
fout.write(b"ggla"[::-1]) # magic (ggml lora)
|
||||
fout.write(struct.pack("i", 1)) # file version
|
||||
|
@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
|
|||
fout.write(struct.pack("i", int(params["lora_alpha"])))
|
||||
|
||||
|
||||
def write_tensor_header(
|
||||
self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
|
||||
) -> None:
|
||||
def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
|
||||
sname = name.encode("utf-8")
|
||||
fout.write(
|
||||
struct.pack(
|
||||
|
@ -78,11 +47,12 @@ def write_tensor_header(
|
|||
fout.seek((fout.tell() + 31) & -32)
|
||||
|
||||
|
||||
if len(sys.argv) != 2:
|
||||
print(f"Usage: python {sys.argv[0]} <path>")
|
||||
if len(sys.argv) < 2:
|
||||
print(f"Usage: python {sys.argv[0]} <path> [arch]")
|
||||
print(
|
||||
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
|
||||
)
|
||||
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
|
||||
sys.exit(1)
|
||||
|
||||
input_json = os.path.join(sys.argv[1], "adapter_config.json")
|
||||
|
@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
|
|||
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
|
||||
|
||||
model = torch.load(input_model, map_location="cpu")
|
||||
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
|
||||
|
||||
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
|
||||
print(f"Error: unsupported architecture {arch_name}")
|
||||
sys.exit(1)
|
||||
|
||||
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
|
||||
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
|
||||
|
||||
with open(input_json, "r") as f:
|
||||
params = json.load(f)
|
||||
|
@ -117,6 +95,7 @@ with open(output_path, "wb") as fout:
|
|||
|
||||
write_file_header(fout, params)
|
||||
for k, v in model.items():
|
||||
orig_k = k
|
||||
if k.endswith(".default.weight"):
|
||||
k = k.replace(".default.weight", ".weight")
|
||||
if k in ["llama_proj.weight", "llama_proj.bias"]:
|
||||
|
@ -129,7 +108,32 @@ with open(output_path, "wb") as fout:
|
|||
v = v.float()
|
||||
|
||||
t = v.detach().numpy()
|
||||
tname = translate_tensor_name(k)
|
||||
|
||||
prefix = "base_model.model."
|
||||
if k.startswith(prefix):
|
||||
k = k[len(prefix) :]
|
||||
|
||||
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
|
||||
if k.endswith(lora_suffixes):
|
||||
suffix = k[-len(lora_suffixes[0]):]
|
||||
k = k[: -len(lora_suffixes[0])]
|
||||
else:
|
||||
print(f"Error: unrecognized tensor name {orig_k}")
|
||||
sys.exit(1)
|
||||
|
||||
tname = name_map.get_name(k)
|
||||
if tname is None:
|
||||
print(f"Error: could not map tensor name {orig_k}")
|
||||
print(" Note: the arch parameter must be specified if the model is not llama")
|
||||
sys.exit(1)
|
||||
|
||||
if suffix == ".lora_A.weight":
|
||||
tname += ".weight.loraA"
|
||||
elif suffix == ".lora_B.weight":
|
||||
tname += ".weight.loraB"
|
||||
else:
|
||||
assert False
|
||||
|
||||
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
|
||||
write_tensor_header(fout, tname, t.shape, t.dtype)
|
||||
t.tofile(fout)
|
||||
|
|
401
convert.py
401
convert.py
|
@ -10,6 +10,7 @@ import itertools
|
|||
import json
|
||||
import math
|
||||
import mmap
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
import signal
|
||||
|
@ -18,15 +19,15 @@ import sys
|
|||
import time
|
||||
import zipfile
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from collections import OrderedDict
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
|
||||
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, TypeVar, cast
|
||||
|
||||
import numpy as np
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
import os
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
|
@ -42,6 +43,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
|||
ARCH = gguf.MODEL_ARCH.LLAMA
|
||||
|
||||
DEFAULT_CONCURRENCY = 8
|
||||
|
||||
#
|
||||
# data types
|
||||
#
|
||||
|
@ -62,10 +64,10 @@ class UnquantizedDataType(DataType):
|
|||
pass
|
||||
|
||||
|
||||
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
|
||||
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
|
||||
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
|
||||
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
|
||||
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
|
||||
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
|
||||
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
|
||||
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
@ -151,14 +153,16 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
|||
|
||||
@dataclass
|
||||
class Params:
|
||||
n_vocab: int
|
||||
n_embd: int
|
||||
n_layer: int
|
||||
n_ctx: int
|
||||
n_ff: int
|
||||
n_head: int
|
||||
n_head_kv: int
|
||||
f_norm_eps: float
|
||||
n_vocab: int
|
||||
n_embd: int
|
||||
n_layer: int
|
||||
n_ctx: int
|
||||
n_ff: int
|
||||
n_head: int
|
||||
n_head_kv: int
|
||||
n_experts: int | None = None
|
||||
n_experts_used: int | None = None
|
||||
f_norm_eps: float | None = None
|
||||
|
||||
rope_scaling_type: gguf.RopeScalingType | None = None
|
||||
f_rope_freq_base: float | None = None
|
||||
|
@ -233,6 +237,13 @@ class Params:
|
|||
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
|
||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
||||
|
||||
n_experts = None
|
||||
n_experts_used = None
|
||||
|
||||
if "num_local_experts" in config:
|
||||
n_experts = config["num_local_experts"]
|
||||
n_experts_used = config["num_experts_per_tok"]
|
||||
|
||||
return Params(
|
||||
n_vocab = config["vocab_size"],
|
||||
n_embd = config["hidden_size"],
|
||||
|
@ -241,6 +252,8 @@ class Params:
|
|||
n_ff = config["intermediate_size"],
|
||||
n_head = (n_head := config["num_attention_heads"]),
|
||||
n_head_kv = config.get("num_key_value_heads", n_head),
|
||||
n_experts = n_experts,
|
||||
n_experts_used = n_experts_used,
|
||||
f_norm_eps = config["rms_norm_eps"],
|
||||
f_rope_freq_base = config.get("rope_theta"),
|
||||
rope_scaling_type = rope_scaling_type,
|
||||
|
@ -255,8 +268,15 @@ class Params:
|
|||
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
||||
config = json.load(open(config_path))
|
||||
|
||||
n_experts = None
|
||||
n_experts_used = None
|
||||
f_rope_freq_base = None
|
||||
|
||||
# hack to determine LLaMA v1 vs v2 vs CodeLlama
|
||||
if config.get("rope_theta") == 1000000:
|
||||
if config.get("moe"):
|
||||
# Mixtral
|
||||
n_ctx = 32768
|
||||
elif config.get("rope_theta") == 1000000:
|
||||
# CodeLlama
|
||||
n_ctx = 16384
|
||||
elif config["norm_eps"] == 1e-05:
|
||||
|
@ -266,16 +286,27 @@ class Params:
|
|||
# LLaMA v1
|
||||
n_ctx = 2048
|
||||
|
||||
if "layers.0.feed_forward.w1.weight" in model:
|
||||
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
|
||||
|
||||
if config.get("moe"):
|
||||
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
|
||||
n_experts = config["moe"]["num_experts"]
|
||||
n_experts_used = config["moe"]["num_experts_per_tok"]
|
||||
f_rope_freq_base = 1e6
|
||||
|
||||
return Params(
|
||||
n_vocab = model["tok_embeddings.weight"].shape[0],
|
||||
n_embd = config["dim"],
|
||||
n_layer = config["n_layers"],
|
||||
n_ctx = n_ctx,
|
||||
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
|
||||
n_ff = n_ff,
|
||||
n_head = (n_head := config["n_heads"]),
|
||||
n_head_kv = config.get("n_kv_heads", n_head),
|
||||
n_experts = n_experts,
|
||||
n_experts_used = n_experts_used,
|
||||
f_norm_eps = config["norm_eps"],
|
||||
f_rope_freq_base = config.get("rope_theta"),
|
||||
f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
@ -297,127 +328,138 @@ class Params:
|
|||
return params
|
||||
|
||||
|
||||
#
|
||||
# vocab
|
||||
#
|
||||
class VocabLoader:
|
||||
def __init__(self, params: Params, fname_tokenizer: Path) -> None:
|
||||
try:
|
||||
from transformers import AutoTokenizer
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"To use VocabLoader, please install the `transformers` package. "
|
||||
"You can install it with `pip install transformers`."
|
||||
) from e
|
||||
|
||||
class BpeVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
||||
added_tokens: dict[str, int]
|
||||
if fname_added_tokens is not None:
|
||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
||||
try:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
|
||||
except ValueError:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
|
||||
|
||||
self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
|
||||
|
||||
for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
|
||||
if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
|
||||
continue
|
||||
|
||||
self.added_tokens_dict[tok] = tokidx
|
||||
|
||||
self.unk_token_id: int = self.tokenizer.unk_token_id
|
||||
self.specials: dict[str, int] = {
|
||||
tok: self.tokenizer.get_vocab()[tok]
|
||||
for tok in self.tokenizer.all_special_tokens
|
||||
}
|
||||
self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
|
||||
self.vocab_size_base: int = self.tokenizer.vocab_size
|
||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
|
||||
self.fname_tokenizer: Path = fname_tokenizer
|
||||
|
||||
vocab_file = "tokenizer.model"
|
||||
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||
if path_candidate is not None:
|
||||
self.spm = SentencePieceProcessor(str(path_candidate))
|
||||
print(self.spm.vocab_size(), self.vocab_size_base)
|
||||
else:
|
||||
# Fall back to trying to find the added tokens in tokenizer.json
|
||||
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
added_tokens = {}
|
||||
else:
|
||||
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
|
||||
added_tokens = dict(
|
||||
(item['content'], item['id'])
|
||||
for item in tokenizer_json.get('added_tokens', [])
|
||||
# Added tokens here can be duplicates of the main vocabulary.
|
||||
if item['content'] not in self.bpe_tokenizer)
|
||||
self.spm = None
|
||||
|
||||
vocab_size: int = len(self.bpe_tokenizer)
|
||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||
actual_ids = sorted(added_tokens.values())
|
||||
if expected_ids != actual_ids:
|
||||
expected_end_id = vocab_size + len(actual_ids) - 1
|
||||
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
|
||||
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.tokenizer
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
|
||||
added_tokens_ids = set(self.added_tokens_dict.values())
|
||||
|
||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||
self.added_tokens_list = [text for (text, idx) in items]
|
||||
self.vocab_size_base: int = vocab_size
|
||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
for i in range(self.vocab_size_base):
|
||||
if i in added_tokens_ids:
|
||||
continue
|
||||
|
||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.bpe_tokenizer
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
||||
text = reverse_vocab[i].encode("utf-8")
|
||||
yield text, self.get_token_score(i), self.get_token_type(i)
|
||||
|
||||
for i, _ in enumerate(tokenizer):
|
||||
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
|
||||
def get_token_type(self, token_id: int) -> gguf.TokenType:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
score = -1000.0
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
|
||||
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.bpe_tokens()
|
||||
yield from self.added_tokens()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
|
||||
class SentencePieceVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
||||
added_tokens: dict[str, int]
|
||||
if fname_added_tokens is not None:
|
||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
||||
else:
|
||||
added_tokens = {}
|
||||
|
||||
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
||||
|
||||
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
||||
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
||||
actual_new_ids = sorted(new_tokens.keys())
|
||||
|
||||
if expected_new_ids != actual_new_ids:
|
||||
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
||||
|
||||
# Token pieces that were added to the base vocabulary.
|
||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
||||
self.vocab_size_base = vocab_size
|
||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
|
||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.sentencepiece_tokenizer
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text: bytes = piece.encode("utf-8")
|
||||
score: float = tokenizer.get_score(i)
|
||||
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
if tokenizer.is_unknown(i):
|
||||
if self.spm is not None and token_id < self.spm.vocab_size():
|
||||
if self.spm.is_unknown(token_id):
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
if tokenizer.is_control(i):
|
||||
if self.spm.is_control(token_id):
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
if self.spm.is_unused(token_id):
|
||||
toktype = gguf.TokenType.UNUSED
|
||||
if self.spm.is_byte(token_id):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
else:
|
||||
if token_id == self.unk_token_id:
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
if token_id in self.special_ids:
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
|
||||
# NOTE: I think added_tokens are user defined.
|
||||
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
|
||||
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
|
||||
return toktype
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = gguf.TokenType.UNUSED
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
|
||||
yield text, score, toktype
|
||||
def get_token_score(self, token_id: int) -> float:
|
||||
if self.spm is not None and token_id < self.spm.vocab_size():
|
||||
return cast(float, self.spm.get_score(token_id))
|
||||
return 0.0
|
||||
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
score = -1000.0
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
||||
|
||||
for text in self.added_tokens_dict:
|
||||
if text in self.specials:
|
||||
|
||||
toktype = self.get_token_type(self.specials[text])
|
||||
score = self.get_token_score(self.specials[text])
|
||||
|
||||
else:
|
||||
toktype = gguf.TokenType.USER_DEFINED
|
||||
score = -1000.0
|
||||
|
||||
yield text.encode("utf-8"), score, toktype
|
||||
|
||||
def has_newline_token(self) -> bool:
|
||||
return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
|
||||
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.sentencepiece_tokens()
|
||||
yield from self.hf_tokens()
|
||||
yield from self.added_tokens()
|
||||
|
||||
def get_vocab_type(self) -> str:
|
||||
path_candidates = []
|
||||
vocab_file = "tokenizer.model"
|
||||
path_candidates.append(vocab_file)
|
||||
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||
if path_candidate is not None:
|
||||
return "llama"
|
||||
|
||||
vocab_file = "vocab.json"
|
||||
path_candidates.append(vocab_file)
|
||||
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||
if path_candidate is not None:
|
||||
return "gpt2"
|
||||
|
||||
vocab_file = "tokenizer.json"
|
||||
path_candidates.append(vocab_file)
|
||||
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
|
||||
if path_candidate:
|
||||
if not self.has_newline_token():
|
||||
return "gpt2"
|
||||
return "llama"
|
||||
|
||||
raise FileNotFoundError(
|
||||
f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; "
|
||||
"if it's in another directory, pass the directory as --vocab-dir"
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
|
||||
|
||||
|
||||
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
|
||||
Vocab: TypeAlias = 'VocabLoader'
|
||||
|
||||
|
||||
#
|
||||
# data loading
|
||||
|
@ -585,7 +627,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
|
|||
|
||||
if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
|
||||
# Transformers models put different tensors in different files, but
|
||||
# don't split indivdual tensors between files.
|
||||
# don't split individual tensors between files.
|
||||
model: LazyModel = {}
|
||||
for mp in models_plus:
|
||||
model.update(mp.model)
|
||||
|
@ -678,7 +720,7 @@ class LazyUnpickler(pickle.Unpickler):
|
|||
return func(*args)
|
||||
|
||||
CLASSES: dict[tuple[str, str], Any] = {
|
||||
# getattr used here as a workaround for mypy not being smart enough to detrmine
|
||||
# getattr used here as a workaround for mypy not being smart enough to determine
|
||||
# the staticmethods have a __func__ attribute.
|
||||
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
||||
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
|
||||
|
@ -794,20 +836,27 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
|
|||
yield result
|
||||
|
||||
|
||||
def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
||||
def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
|
||||
if params.n_vocab != vocab.vocab_size:
|
||||
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
|
||||
if params.n_vocab == vocab.vocab_size_base:
|
||||
if params.n_vocab == vocab.vocab_size:
|
||||
print("Ignoring added_tokens.json since model matches vocab size without it.")
|
||||
vocab.added_tokens_list = []
|
||||
vocab.vocab_size = vocab.vocab_size_base
|
||||
vocab.added_tokens_dict = OrderedDict()
|
||||
vocab.vocab_size = vocab.vocab_size
|
||||
return
|
||||
|
||||
if pad_vocab and params.n_vocab > vocab.vocab_size:
|
||||
pad_count = params.n_vocab - vocab.vocab_size
|
||||
print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
|
||||
for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
|
||||
vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
|
||||
vocab.vocab_size = params.n_vocab
|
||||
return
|
||||
msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
|
||||
if vocab.fname_added_tokens is not None:
|
||||
msg += f" combined with {vocab.fname_added_tokens}"
|
||||
msg += f" has {vocab.vocab_size})."
|
||||
if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
|
||||
if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
|
||||
msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
|
||||
if vocab.vocab_size < params.n_vocab:
|
||||
msg += " Possibly try using the --padvocab option."
|
||||
raise Exception(msg)
|
||||
|
||||
|
||||
|
@ -832,7 +881,17 @@ class OutputFile:
|
|||
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
|
||||
self.gguf.add_head_count (params.n_head)
|
||||
self.gguf.add_head_count_kv (params.n_head_kv)
|
||||
self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
|
||||
|
||||
if params.n_experts:
|
||||
self.gguf.add_expert_count(params.n_experts)
|
||||
|
||||
if params.n_experts_used:
|
||||
self.gguf.add_expert_used_count(params.n_experts_used)
|
||||
|
||||
if params.f_norm_eps:
|
||||
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
|
||||
else:
|
||||
raise ValueError('f_norm_eps is None')
|
||||
|
||||
if params.f_rope_freq_base is not None:
|
||||
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
||||
|
@ -861,12 +920,8 @@ class OutputFile:
|
|||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
if isinstance(vocab, SentencePieceVocab):
|
||||
self.gguf.add_tokenizer_model("llama")
|
||||
elif isinstance(vocab, BpeVocab):
|
||||
self.gguf.add_tokenizer_model("gpt2")
|
||||
else:
|
||||
raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
|
||||
vocab_type = vocab.get_vocab_type()
|
||||
self.gguf.add_tokenizer_model(vocab_type)
|
||||
self.gguf.add_token_list(tokens)
|
||||
self.gguf.add_token_scores(scores)
|
||||
self.gguf.add_token_types(toktypes)
|
||||
|
@ -892,8 +947,12 @@ class OutputFile:
|
|||
self.gguf.close()
|
||||
|
||||
@staticmethod
|
||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
||||
check_vocab_size(params, vocab)
|
||||
def write_vocab_only(
|
||||
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||
pad_vocab: bool = False,
|
||||
) -> None:
|
||||
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
|
||||
|
||||
of = OutputFile(fname_out, endianess=endianess)
|
||||
|
||||
|
@ -920,8 +979,13 @@ class OutputFile:
|
|||
return dt.quantize(arr)
|
||||
|
||||
@staticmethod
|
||||
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
||||
check_vocab_size(params, vocab)
|
||||
def write_all(
|
||||
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||
concurrency: int = DEFAULT_CONCURRENCY,
|
||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||
pad_vocab: bool = False,
|
||||
) -> None:
|
||||
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
|
||||
|
||||
of = OutputFile(fname_out, endianess=endianess)
|
||||
|
||||
|
@ -956,7 +1020,7 @@ class OutputFile:
|
|||
|
||||
|
||||
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
||||
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
|
||||
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
|
||||
|
||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||
return GGMLFileType.AllF32
|
||||
|
@ -1079,35 +1143,17 @@ def load_some_model(path: Path) -> ModelPlus:
|
|||
return model_plus
|
||||
|
||||
|
||||
def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
|
||||
# Be extra-friendly and accept either a file or a directory. Also, if it's
|
||||
# a directory, it might be the model directory, and tokenizer.model might
|
||||
# be in the parent of that.
|
||||
if path.is_dir():
|
||||
vocab_file = "tokenizer.model"
|
||||
if vocabtype == 'bpe':
|
||||
vocab_file = "vocab.json"
|
||||
path2 = path / vocab_file
|
||||
# Use `.parent` instead of /.. to handle the symlink case better.
|
||||
path3 = path.parent / vocab_file
|
||||
if path2.exists():
|
||||
path = path2
|
||||
elif path3.exists():
|
||||
path = path3
|
||||
else:
|
||||
raise FileNotFoundError(
|
||||
f"Could not find {vocab_file} in {path} or its parent; "
|
||||
"if it's in another directory, pass the directory as --vocab-dir")
|
||||
def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
|
||||
path2 = path / vocab_file
|
||||
# Use `.parent` instead of /.. to handle the symlink case better.
|
||||
path3 = path.parent / vocab_file
|
||||
|
||||
print(f"Loading vocab file '{path}', type '{vocabtype}'")
|
||||
if path2.exists():
|
||||
return path2
|
||||
if path3.exists():
|
||||
return path3
|
||||
|
||||
added_tokens_path = path.parent / "added_tokens.json"
|
||||
if vocabtype == "bpe":
|
||||
return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
|
||||
elif vocabtype == "spm":
|
||||
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
|
||||
else:
|
||||
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
|
||||
return None
|
||||
|
||||
|
||||
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
|
||||
|
@ -1145,11 +1191,11 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
|
||||
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)")
|
||||
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
|
||||
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
|
||||
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
|
||||
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
|
||||
parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
||||
|
||||
args = parser.parse_args(args_in)
|
||||
if args.dump_single:
|
||||
|
@ -1192,12 +1238,13 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
if not args.outfile:
|
||||
raise ValueError("need --outfile if using --vocab-only")
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
|
||||
vocab = VocabLoader(params, args.vocab_dir or args.model)
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||
load_merges = args.vocabtype == 'bpe',
|
||||
load_merges = True,
|
||||
n_vocab = vocab.vocab_size)
|
||||
outfile = args.outfile
|
||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
|
||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
|
||||
endianess = endianess, pad_vocab = args.padvocab)
|
||||
print(f"Wrote {outfile}")
|
||||
return
|
||||
|
||||
|
@ -1205,12 +1252,15 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
vocab = model_plus.vocab
|
||||
else:
|
||||
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
||||
vocab = load_vocab(vocab_dir, args.vocabtype)
|
||||
vocab = VocabLoader(params, vocab_dir)
|
||||
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
print(f"Vocab info: {vocab}")
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||
load_merges = args.vocabtype == 'bpe',
|
||||
load_merges = True,
|
||||
n_vocab = vocab.vocab_size)
|
||||
|
||||
print(f"Special vocab info: {special_vocab}")
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
|
@ -1220,7 +1270,8 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
params.ftype = ftype
|
||||
print(f"Writing {outfile}, format {ftype}")
|
||||
|
||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
|
||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
|
||||
concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
|
||||
print(f"Wrote {outfile}")
|
||||
|
||||
|
||||
|
|
|
@ -1258,9 +1258,9 @@ static struct ggml_tensor * forward_lora(
|
|||
}
|
||||
|
||||
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||
assert(logits->n_dims == 2);
|
||||
assert(probs->n_dims == 2);
|
||||
assert(best_samples->n_dims == 1);
|
||||
assert(ggml_is_matrix(logits));
|
||||
assert(ggml_is_matrix(probs));
|
||||
assert(ggml_is_vector(best_samples));
|
||||
assert(logits->ne[1] == best_samples->ne[0]);
|
||||
assert(logits->ne[0] == probs->ne[0]);
|
||||
assert(logits->ne[1] == probs->ne[1]);
|
||||
|
@ -1292,9 +1292,9 @@ static void sample_softmax_batch(
|
|||
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
|
||||
struct ggml_tensor * best_samples
|
||||
) {
|
||||
GGML_ASSERT(best_samples->n_dims == 2);
|
||||
GGML_ASSERT(logits->n_dims == 3);
|
||||
GGML_ASSERT(probs->n_dims == 3);
|
||||
GGML_ASSERT(ggml_is_matrix(best_samples));
|
||||
GGML_ASSERT(ggml_is_3d(logits));
|
||||
GGML_ASSERT(ggml_is_3d(probs));
|
||||
int n_tokens = best_samples->ne[0];
|
||||
int n_batch = best_samples->ne[1];
|
||||
int n_vocab = logits->ne[0];
|
||||
|
@ -1334,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
|
|||
}
|
||||
|
||||
static void print_matrix(struct ggml_tensor * probs) {
|
||||
assert(probs->n_dims == 2);
|
||||
assert(ggml_is_matrix(probs));
|
||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
|
||||
|
@ -1386,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
|
|||
static void get_example_targets_batch(
|
||||
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
|
||||
) {
|
||||
GGML_ASSERT(tokens_input->n_dims == 2);
|
||||
GGML_ASSERT( targets->n_dims == 3);
|
||||
GGML_ASSERT(ggml_is_matrix(tokens_input));
|
||||
GGML_ASSERT(ggml_is_3d(targets));
|
||||
int n_tokens = tokens_input->ne[0];
|
||||
int n_batch = tokens_input->ne[1];
|
||||
GGML_ASSERT(n_tokens == targets->ne[1]);
|
||||
|
|
|
@ -129,13 +129,13 @@ int main(int argc, char ** argv) {
|
|||
const ggml_type qtype = GGML_TYPE_Q4_1;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
||||
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
|
||||
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
|
||||
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
|
||||
ctx_size += ggml_row_size(qtype, sizex*sizey);
|
||||
ctx_size += ggml_row_size(qtype, sizex*sizey);
|
||||
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
|
||||
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
|
||||
ctx_size += 1024*1024*16;
|
||||
|
||||
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
|
||||
|
|
|
@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
|
|||
}
|
||||
|
||||
static void print_matrix(struct ggml_tensor * probs) {
|
||||
assert(probs->n_dims == 2);
|
||||
assert(ggml_is_matrix(probs));
|
||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
float p = get_f32_2d(probs, k, i);
|
||||
|
@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
|
|||
|
||||
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||
int ct;
|
||||
switch (gg_weights->n_dims){
|
||||
switch (ggml_n_dims(gg_weights)) {
|
||||
case 1:
|
||||
ct = 0;
|
||||
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
|
||||
|
|
|
@ -1110,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
|
|||
name = ggml_get_name(tensor);
|
||||
}
|
||||
uint32_t name_len = strlen(name);
|
||||
uint32_t nd = tensor->n_dims;
|
||||
uint32_t nd = ggml_n_dims(tensor);
|
||||
uint32_t ne[4] = { (uint32_t)tensor->ne[0],
|
||||
(uint32_t)tensor->ne[1],
|
||||
(uint32_t)tensor->ne[2],
|
||||
|
|
|
@ -195,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
|||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
|
||||
|
||||
// print first 10 elements
|
||||
const float * data = (const float *) cur->data;
|
||||
|
|
|
@ -514,7 +514,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
ctx_size += padded_size;
|
||||
if (verbosity >= 3) {
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
|
||||
cur->n_dims, cur->name, tensor_size, padded_size, offset);
|
||||
ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -739,7 +739,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
|||
temp->ny = longer_side;
|
||||
temp->size = 3 * longer_side * longer_side;
|
||||
temp->data = new uint8_t[temp->size]();
|
||||
uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
|
||||
uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
||||
|
||||
// fill with background color
|
||||
for (size_t i = 0; i < temp->size; i++) {
|
||||
|
@ -962,7 +962,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||
}
|
||||
|
||||
// quantize only 2D tensors
|
||||
quantize &= (cur->n_dims == 2);
|
||||
quantize &= (ggml_n_dims(cur) == 2);
|
||||
|
||||
if (quantize) {
|
||||
new_type = type;
|
||||
|
@ -1035,7 +1035,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||
fout.put(0);
|
||||
}
|
||||
|
||||
printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize,
|
||||
printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
||||
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
|
|
|
@ -51,7 +51,7 @@ def bytes_to_unicode():
|
|||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a signficant percentage of your normal, say, 32K bpe vocab.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# llama.cpp/examples/lookahead
|
||||
|
||||
Demonstartion of lookahead decoding technique:
|
||||
Demonstration of lookahead decoding technique:
|
||||
|
||||
https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||
|
||||
|
|
|
@ -222,7 +222,7 @@ node index.js
|
|||
|
||||
`content`: Set the text to process.
|
||||
|
||||
**POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
|
||||
- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
|
||||
|
||||
*Options:*
|
||||
|
||||
|
|
|
@ -11227,7 +11227,7 @@ class binary_reader
|
|||
}
|
||||
if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array
|
||||
{
|
||||
return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimentional vector is not allowed", "size"), nullptr));
|
||||
return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimensional vector is not allowed", "size"), nullptr));
|
||||
}
|
||||
std::vector<size_t> dim;
|
||||
if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim)))
|
||||
|
|
|
@ -34,7 +34,8 @@ export async function* llama(prompt, params = {}, config = {}) {
|
|||
headers: {
|
||||
'Connection': 'keep-alive',
|
||||
'Content-Type': 'application/json',
|
||||
'Accept': 'text/event-stream'
|
||||
'Accept': 'text/event-stream',
|
||||
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
|
||||
},
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
@ -114,7 +115,7 @@ export async function* llama(prompt, params = {}, config = {}) {
|
|||
return content;
|
||||
}
|
||||
|
||||
// Call llama, return an event target that you can subcribe to
|
||||
// Call llama, return an event target that you can subscribe to
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
|
|
|
@ -223,7 +223,7 @@
|
|||
repeat_last_n: 256, // 0 = disable penalty, -1 = context size
|
||||
repeat_penalty: 1.18, // 1.0 = disabled
|
||||
top_k: 40, // <= 0 to use vocab size
|
||||
top_p: 0.5, // 1.0 = disabled
|
||||
top_p: 0.95, // 1.0 = disabled
|
||||
min_p: 0.05, // 0 = disabled
|
||||
tfs_z: 1.0, // 1.0 = disabled
|
||||
typical_p: 1.0, // 1.0 = disabled
|
||||
|
@ -235,10 +235,11 @@
|
|||
grammar: '',
|
||||
n_probs: 0, // no completion_probabilities,
|
||||
image_data: [],
|
||||
cache_prompt: true
|
||||
cache_prompt: true,
|
||||
api_key: ''
|
||||
})
|
||||
|
||||
/* START: Support for storing prompt templates and parameters in borwser LocalStorage */
|
||||
/* START: Support for storing prompt templates and parameters in browsers LocalStorage */
|
||||
|
||||
const local_storage_storageKey = "llamacpp_server_local_storage";
|
||||
|
||||
|
@ -282,7 +283,7 @@
|
|||
let importedTemplates = local_storage_getDataAsObject('user_templates')
|
||||
|
||||
if (importedTemplates) {
|
||||
// saved templates were successfuly imported.
|
||||
// saved templates were successfully imported.
|
||||
|
||||
console.log('Processing saved templates and updating default template')
|
||||
params.value = { ...params.value, image_data: [] };
|
||||
|
@ -303,7 +304,7 @@
|
|||
}
|
||||
|
||||
function userTemplateResetToDefault() {
|
||||
console.log('Reseting themplate to default')
|
||||
console.log('Resetting template to default')
|
||||
selectedUserTemplate.value.name = 'default';
|
||||
selectedUserTemplate.value.data = savedUserTemplates.value['default'];
|
||||
}
|
||||
|
@ -762,7 +763,7 @@
|
|||
|
||||
<fieldset class="two">
|
||||
${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
|
||||
${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
|
||||
${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
|
||||
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
|
||||
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
|
||||
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
|
||||
|
@ -790,6 +791,10 @@
|
|||
<fieldset>
|
||||
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<label for="api_key">API Key</label>
|
||||
<input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
|
||||
</fieldset>
|
||||
</details>
|
||||
</form>
|
||||
`
|
||||
|
|
|
@ -36,6 +36,7 @@ using json = nlohmann::json;
|
|||
struct server_params
|
||||
{
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string api_key;
|
||||
std::string public_path = "examples/server/public";
|
||||
int32_t port = 8080;
|
||||
int32_t read_timeout = 600;
|
||||
|
@ -376,7 +377,6 @@ struct llama_client_slot
|
|||
|
||||
int32_t num_prompt_tokens = 0;
|
||||
int32_t num_prompt_tokens_processed = 0;
|
||||
int32_t multibyte_pending = 0;
|
||||
|
||||
json prompt;
|
||||
std::string generated_text;
|
||||
|
@ -425,7 +425,6 @@ struct llama_client_slot
|
|||
stopped_word = false;
|
||||
stopped_limit = false;
|
||||
stopping_word = "";
|
||||
multibyte_pending = 0;
|
||||
n_past = 0;
|
||||
sent_count = 0;
|
||||
sent_token_probs_index = 0;
|
||||
|
@ -992,35 +991,36 @@ struct llama_server_context
|
|||
slot.generated_text += token_str;
|
||||
slot.has_next_token = true;
|
||||
|
||||
if (slot.multibyte_pending > 0)
|
||||
// check if there is incomplete UTF-8 character at the end
|
||||
bool incomplete = false;
|
||||
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
|
||||
{
|
||||
slot.multibyte_pending -= token_str.size();
|
||||
}
|
||||
else if (token_str.size() == 1)
|
||||
{
|
||||
const char c = token_str[0];
|
||||
// 2-byte characters: 110xxxxx 10xxxxxx
|
||||
unsigned char c = slot.generated_text[slot.generated_text.size() - i];
|
||||
if ((c & 0xC0) == 0x80)
|
||||
{
|
||||
// continuation byte: 10xxxxxx
|
||||
continue;
|
||||
}
|
||||
if ((c & 0xE0) == 0xC0)
|
||||
{
|
||||
slot.multibyte_pending = 1;
|
||||
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
|
||||
// 2-byte character: 110xxxxx ...
|
||||
incomplete = i < 2;
|
||||
}
|
||||
else if ((c & 0xF0) == 0xE0)
|
||||
{
|
||||
slot.multibyte_pending = 2;
|
||||
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
// 3-byte character: 1110xxxx ...
|
||||
incomplete = i < 3;
|
||||
}
|
||||
else if ((c & 0xF8) == 0xF0)
|
||||
{
|
||||
slot.multibyte_pending = 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
slot.multibyte_pending = 0;
|
||||
// 4-byte character: 11110xxx ...
|
||||
incomplete = i < 4;
|
||||
}
|
||||
// else 1-byte character or invalid byte
|
||||
break;
|
||||
}
|
||||
|
||||
if (slot.multibyte_pending == 0)
|
||||
if (!incomplete)
|
||||
{
|
||||
size_t pos = std::min(slot.sent_count, slot.generated_text.size());
|
||||
const std::string str_test = slot.generated_text.substr(pos);
|
||||
|
@ -1055,7 +1055,7 @@ struct llama_server_context
|
|||
}
|
||||
}
|
||||
|
||||
if (slot.multibyte_pending > 0 && !slot.has_next_token)
|
||||
if (incomplete)
|
||||
{
|
||||
slot.has_next_token = true;
|
||||
}
|
||||
|
@ -1954,6 +1954,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
|
||||
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
|
||||
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
|
||||
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
|
||||
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
|
||||
|
@ -2003,6 +2004,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
}
|
||||
sparams.public_path = argv[i];
|
||||
}
|
||||
else if (arg == "--api-key")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.api_key = argv[i];
|
||||
}
|
||||
else if (arg == "--timeout" || arg == "-to")
|
||||
{
|
||||
if (++i >= argc)
|
||||
|
@ -2382,6 +2392,7 @@ json oaicompat_completion_params_parse(
|
|||
llama_params["__oaicompat"] = true;
|
||||
|
||||
// Map OpenAI parameters to llama.cpp parameters
|
||||
llama_params["model"] = json_value(body, "model", std::string("uknown"));
|
||||
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.8);
|
||||
|
@ -2669,6 +2680,32 @@ int main(int argc, char **argv)
|
|||
|
||||
httplib::Server svr;
|
||||
|
||||
// Middleware for API key validation
|
||||
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
|
||||
// If API key is not set, skip validation
|
||||
if (sparams.api_key.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check for API key in the header
|
||||
auto auth_header = req.get_header_value("Authorization");
|
||||
std::string prefix = "Bearer ";
|
||||
if (auth_header.substr(0, prefix.size()) == prefix) {
|
||||
std::string received_api_key = auth_header.substr(prefix.size());
|
||||
if (received_api_key == sparams.api_key) {
|
||||
return true; // API key is valid
|
||||
}
|
||||
}
|
||||
|
||||
// API key is invalid or not provided
|
||||
res.set_content("Unauthorized: Invalid API Key", "text/plain");
|
||||
res.status = 401; // Unauthorized
|
||||
|
||||
LOG_WARNING("Unauthorized: Invalid API Key", {});
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
svr.set_default_headers({{"Server", "llama.cpp"},
|
||||
{"Access-Control-Allow-Origin", "*"},
|
||||
{"Access-Control-Allow-Headers", "content-type"}});
|
||||
|
@ -2711,8 +2748,11 @@ int main(int argc, char **argv)
|
|||
res.set_content(data.dump(), "application/json");
|
||||
});
|
||||
|
||||
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
if (!validate_api_key(req, res)) {
|
||||
return;
|
||||
}
|
||||
json data = json::parse(req.body);
|
||||
const int task_id = llama.request_completion(data, false, false, -1);
|
||||
if (!json_value(data, "stream", false)) {
|
||||
|
@ -2799,8 +2839,11 @@ int main(int argc, char **argv)
|
|||
});
|
||||
|
||||
// TODO: add mount point without "/v1" prefix -- how?
|
||||
svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
if (!validate_api_key(req, res)) {
|
||||
return;
|
||||
}
|
||||
json data = oaicompat_completion_params_parse(json::parse(req.body));
|
||||
|
||||
const int task_id = llama.request_completion(data, false, false, -1);
|
||||
|
@ -2869,8 +2912,11 @@ int main(int argc, char **argv)
|
|||
}
|
||||
});
|
||||
|
||||
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
if (!validate_api_key(req, res)) {
|
||||
return;
|
||||
}
|
||||
json data = json::parse(req.body);
|
||||
const int task_id = llama.request_completion(data, true, false, -1);
|
||||
if (!json_value(data, "stream", false)) {
|
||||
|
@ -3005,11 +3051,15 @@ int main(int argc, char **argv)
|
|||
|
||||
svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
|
||||
{
|
||||
if (res.status == 401)
|
||||
{
|
||||
res.set_content("Unauthorized", "text/plain");
|
||||
}
|
||||
if (res.status == 400)
|
||||
{
|
||||
res.set_content("Invalid request", "text/plain");
|
||||
}
|
||||
else if (res.status != 500)
|
||||
else if (res.status == 404)
|
||||
{
|
||||
res.set_content("File Not Found", "text/plain");
|
||||
res.status = 404;
|
||||
|
@ -3032,11 +3082,15 @@ int main(int argc, char **argv)
|
|||
// to make it ctrl+clickable:
|
||||
LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
|
||||
|
||||
LOG_INFO("HTTP server listening", {
|
||||
{"hostname", sparams.hostname},
|
||||
{"port", sparams.port},
|
||||
});
|
||||
std::unordered_map<std::string, std::string> log_data;
|
||||
log_data["hostname"] = sparams.hostname;
|
||||
log_data["port"] = std::to_string(sparams.port);
|
||||
|
||||
if (!sparams.api_key.empty()) {
|
||||
log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4);
|
||||
}
|
||||
|
||||
LOG_INFO("HTTP server listening", log_data);
|
||||
// run the HTTP server in a thread - see comment below
|
||||
std::thread t([&]()
|
||||
{
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# llama.cpp/examples/speculative
|
||||
|
||||
Demonstartion of speculative decoding and tree-based speculative decoding techniques
|
||||
Demonstration of speculative decoding and tree-based speculative decoding techniques
|
||||
|
||||
More info:
|
||||
|
||||
|
|
|
@ -428,7 +428,7 @@ int main(int argc, char ** argv) {
|
|||
++n_past_tgt;
|
||||
}
|
||||
|
||||
// the first token is always proposed by the traget model before the speculation loop so we erase it here
|
||||
// the first token is always proposed by the target model before the speculation loop so we erase it here
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].active) {
|
||||
continue;
|
||||
|
|
|
@ -43,7 +43,7 @@ GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph
|
|||
// ggml-backend v2 API
|
||||
//
|
||||
|
||||
// Seperate tensor and graph allocator objects
|
||||
// Separate tensor and graph allocator objects
|
||||
// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
|
||||
// The original API is kept as a wrapper around the new API
|
||||
|
||||
|
|
788
ggml-cuda.cu
788
ggml-cuda.cu
File diff suppressed because it is too large
Load diff
587
ggml-metal.m
587
ggml-metal.m
|
@ -66,9 +66,11 @@ struct ggml_metal_context {
|
|||
GGML_METAL_DECL_KERNEL(div_row);
|
||||
GGML_METAL_DECL_KERNEL(scale);
|
||||
GGML_METAL_DECL_KERNEL(scale_4);
|
||||
GGML_METAL_DECL_KERNEL(silu);
|
||||
GGML_METAL_DECL_KERNEL(tanh);
|
||||
GGML_METAL_DECL_KERNEL(relu);
|
||||
GGML_METAL_DECL_KERNEL(gelu);
|
||||
GGML_METAL_DECL_KERNEL(gelu_quick);
|
||||
GGML_METAL_DECL_KERNEL(silu);
|
||||
GGML_METAL_DECL_KERNEL(soft_max);
|
||||
GGML_METAL_DECL_KERNEL(soft_max_4);
|
||||
GGML_METAL_DECL_KERNEL(diag_mask_inf);
|
||||
|
@ -86,6 +88,7 @@ struct ggml_metal_context {
|
|||
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(group_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f16);
|
||||
|
@ -102,6 +105,21 @@ struct ggml_metal_context {
|
|||
GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32);
|
||||
//GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32);
|
||||
//GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_1row);
|
||||
//GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_l4);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q5_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q5_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q8_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q2_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q3_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
||||
|
@ -130,8 +148,11 @@ struct ggml_metal_context {
|
|||
GGML_METAL_DECL_KERNEL(rope_f16);
|
||||
GGML_METAL_DECL_KERNEL(alibi_f32);
|
||||
GGML_METAL_DECL_KERNEL(im2col_f16);
|
||||
GGML_METAL_DECL_KERNEL(upscale_f32);
|
||||
GGML_METAL_DECL_KERNEL(pad_f32);
|
||||
GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc);
|
||||
GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc);
|
||||
GGML_METAL_DECL_KERNEL(leaky_relu_f32);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_q8_0);
|
||||
|
@ -140,6 +161,7 @@ struct ggml_metal_context {
|
|||
//GGML_METAL_DECL_KERNEL(cpy_f32_q5_0);
|
||||
//GGML_METAL_DECL_KERNEL(cpy_f32_q5_1);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f16_f16);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(concat);
|
||||
GGML_METAL_DECL_KERNEL(sqr);
|
||||
GGML_METAL_DECL_KERNEL(sum_rows);
|
||||
|
@ -177,6 +199,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
|
|||
ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
|
||||
} else {
|
||||
char* buffer2 = malloc(len+1);
|
||||
va_end(args);
|
||||
va_start(args, format);
|
||||
vsnprintf(buffer2, len+1, format, args);
|
||||
buffer2[len] = 0;
|
||||
ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
|
||||
|
@ -316,9 +340,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(div_row);
|
||||
GGML_METAL_ADD_KERNEL(scale);
|
||||
GGML_METAL_ADD_KERNEL(scale_4);
|
||||
GGML_METAL_ADD_KERNEL(silu);
|
||||
GGML_METAL_ADD_KERNEL(tanh);
|
||||
GGML_METAL_ADD_KERNEL(relu);
|
||||
GGML_METAL_ADD_KERNEL(gelu);
|
||||
GGML_METAL_ADD_KERNEL(gelu_quick);
|
||||
GGML_METAL_ADD_KERNEL(silu);
|
||||
GGML_METAL_ADD_KERNEL(soft_max);
|
||||
GGML_METAL_ADD_KERNEL(soft_max_4);
|
||||
GGML_METAL_ADD_KERNEL(diag_mask_inf);
|
||||
|
@ -336,6 +362,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(group_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f16);
|
||||
|
@ -352,6 +379,21 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32);
|
||||
//GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32);
|
||||
//GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_1row);
|
||||
//GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_l4);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q5_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q5_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32);
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
|
@ -382,8 +424,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(rope_f16);
|
||||
GGML_METAL_ADD_KERNEL(alibi_f32);
|
||||
GGML_METAL_ADD_KERNEL(im2col_f16);
|
||||
GGML_METAL_ADD_KERNEL(upscale_f32);
|
||||
GGML_METAL_ADD_KERNEL(pad_f32);
|
||||
GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc);
|
||||
GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc);
|
||||
GGML_METAL_ADD_KERNEL(leaky_relu_f32);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_q8_0);
|
||||
|
@ -392,6 +437,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
//GGML_METAL_ADD_KERNEL(cpy_f32_q5_0);
|
||||
//GGML_METAL_ADD_KERNEL(cpy_f32_q5_1);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f16_f16);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(concat);
|
||||
GGML_METAL_ADD_KERNEL(sqr);
|
||||
GGML_METAL_ADD_KERNEL(sum_rows);
|
||||
|
@ -416,9 +462,11 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
GGML_METAL_DEL_KERNEL(div_row);
|
||||
GGML_METAL_DEL_KERNEL(scale);
|
||||
GGML_METAL_DEL_KERNEL(scale_4);
|
||||
GGML_METAL_DEL_KERNEL(silu);
|
||||
GGML_METAL_DEL_KERNEL(tanh);
|
||||
GGML_METAL_DEL_KERNEL(relu);
|
||||
GGML_METAL_DEL_KERNEL(gelu);
|
||||
GGML_METAL_DEL_KERNEL(gelu_quick);
|
||||
GGML_METAL_DEL_KERNEL(silu);
|
||||
GGML_METAL_DEL_KERNEL(soft_max);
|
||||
GGML_METAL_DEL_KERNEL(soft_max_4);
|
||||
GGML_METAL_DEL_KERNEL(diag_mask_inf);
|
||||
|
@ -436,6 +484,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
GGML_METAL_DEL_KERNEL(get_rows_q5_K);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||
GGML_METAL_DEL_KERNEL(group_norm);
|
||||
GGML_METAL_DEL_KERNEL(norm);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f16);
|
||||
|
@ -452,6 +501,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32);
|
||||
//GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32);
|
||||
//GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_1row);
|
||||
//GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_l4);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q5_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q5_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32);
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
|
@ -482,8 +546,11 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
GGML_METAL_DEL_KERNEL(rope_f16);
|
||||
GGML_METAL_DEL_KERNEL(alibi_f32);
|
||||
GGML_METAL_DEL_KERNEL(im2col_f16);
|
||||
GGML_METAL_DEL_KERNEL(upscale_f32);
|
||||
GGML_METAL_DEL_KERNEL(pad_f32);
|
||||
GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc);
|
||||
GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc);
|
||||
GGML_METAL_DEL_KERNEL(leaky_relu_f32);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_q8_0);
|
||||
|
@ -492,6 +559,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
//GGML_METAL_DEL_KERNEL(cpy_f32_q5_0);
|
||||
//GGML_METAL_DEL_KERNEL(cpy_f32_q5_1);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f16_f16);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(concat);
|
||||
GGML_METAL_DEL_KERNEL(sqr);
|
||||
GGML_METAL_DEL_KERNEL(sum_rows);
|
||||
|
@ -793,9 +861,11 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
|
|||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
@ -807,6 +877,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
|
|||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_SCALE:
|
||||
|
@ -814,21 +885,50 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
|
|||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_ALIBI:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
return true;
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CONT:
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
case GGML_TYPE_F16:
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
};
|
||||
}
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
return op->ne[0] % 4 == 0;
|
||||
return op->ne[3] == 1;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
|
@ -904,7 +1004,10 @@ void ggml_metal_graph_compute(
|
|||
} break;
|
||||
}
|
||||
|
||||
GGML_ASSERT(ggml_metal_supports_op(dst));
|
||||
if (!ggml_metal_supports_op(dst)) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
|
||||
GGML_ASSERT(!"unsupported op");
|
||||
}
|
||||
|
||||
const int64_t ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int64_t ne01 = src0 ? src0->ne[1] : 0;
|
||||
|
@ -1001,34 +1104,39 @@ void ggml_metal_graph_compute(
|
|||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
const size_t offs = 0;
|
||||
|
||||
bool bcast_row = false;
|
||||
|
||||
int64_t nb = ne00;
|
||||
|
||||
if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) {
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
nb = ne00 / 4;
|
||||
switch (dst->op) {
|
||||
case GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add_row]; break;
|
||||
case GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul_row]; break;
|
||||
case GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div_row]; break;
|
||||
case GGML_OP_ADD: pipeline = ctx->pipeline_add_row; break;
|
||||
case GGML_OP_MUL: pipeline = ctx->pipeline_mul_row; break;
|
||||
case GGML_OP_DIV: pipeline = ctx->pipeline_div_row; break;
|
||||
default: GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
bcast_row = true;
|
||||
} else {
|
||||
switch (dst->op) {
|
||||
case GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add]; break;
|
||||
case GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul]; break;
|
||||
case GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div]; break;
|
||||
case GGML_OP_ADD: pipeline = ctx->pipeline_add; break;
|
||||
case GGML_OP_MUL: pipeline = ctx->pipeline_mul; break;
|
||||
case GGML_OP_DIV: pipeline = ctx->pipeline_div; break;
|
||||
default: GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
|
@ -1056,18 +1164,99 @@ void ggml_metal_graph_compute(
|
|||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
|
||||
[encoder setBytes:&offs length:sizeof(offs) atIndex:27];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:28];
|
||||
|
||||
if (bcast_row) {
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} else {
|
||||
const int nth = MIN(1024, ne0);
|
||||
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_ACC:
|
||||
{
|
||||
GGML_ASSERT(src0t == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dstt == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
const size_t pnb1 = ((int32_t *) dst->op_params)[0];
|
||||
const size_t pnb2 = ((int32_t *) dst->op_params)[1];
|
||||
const size_t pnb3 = ((int32_t *) dst->op_params)[2];
|
||||
const size_t offs = ((int32_t *) dst->op_params)[3];
|
||||
|
||||
const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
|
||||
|
||||
if (!inplace) {
|
||||
// run a separete kernel to cpy src->dst
|
||||
// not sure how to avoid this
|
||||
// TODO: make a simpler cpy_bytes kernel
|
||||
|
||||
const int nth = MIN(1024, ne00);
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
}
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_add];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
|
||||
[encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
|
||||
[encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
|
||||
[encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
|
||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
|
||||
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
|
||||
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23];
|
||||
[encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
|
||||
[encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
|
||||
[encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
|
||||
[encoder setBytes:&offs length:sizeof(offs) atIndex:27];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_SCALE:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
@ -1091,16 +1280,15 @@ void ggml_metal_graph_compute(
|
|||
} break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(gf->nodes[i])) {
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
{
|
||||
[encoder setComputePipelineState:ctx->pipeline_silu];
|
||||
[encoder setComputePipelineState:ctx->pipeline_tanh];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_RELU:
|
||||
{
|
||||
|
@ -1121,6 +1309,28 @@ void ggml_metal_graph_compute(
|
|||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
{
|
||||
[encoder setComputePipelineState:ctx->pipeline_gelu_quick];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
{
|
||||
[encoder setComputePipelineState:ctx->pipeline_silu];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
default:
|
||||
|
@ -1193,7 +1403,11 @@ void ggml_metal_graph_compute(
|
|||
const float scale = ((float *) dst->op_params)[0];
|
||||
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
if (id_src1) {
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
} else {
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
}
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||
|
@ -1444,7 +1658,7 @@ void ggml_metal_graph_compute(
|
|||
else if (src0t == GGML_TYPE_Q6_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else {
|
||||
int64_t ny = (ne11 + nrows - 1)/nrows;
|
||||
const int64_t ny = (ne11 + nrows - 1)/nrows;
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
}
|
||||
|
@ -1456,7 +1670,7 @@ void ggml_metal_graph_compute(
|
|||
|
||||
GGML_ASSERT(src0t == GGML_TYPE_I32);
|
||||
|
||||
const int n_as = ne00;
|
||||
const int n_as = ((int32_t *) dst->op_params)[1];
|
||||
|
||||
// TODO: make this more general
|
||||
GGML_ASSERT(n_as <= 8);
|
||||
|
@ -1488,14 +1702,22 @@ void ggml_metal_graph_compute(
|
|||
|
||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||
// to the matrix-vector kernel
|
||||
int ne11_mm_min = 0;
|
||||
int ne11_mm_min = 1;
|
||||
|
||||
const int idx = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
// batch size
|
||||
GGML_ASSERT(ne01 == ne11);
|
||||
|
||||
const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
ne11 > ne11_mm_min) {
|
||||
// !!!
|
||||
// TODO: for now, always use mat-vec kernels until we figure out how to improve the
|
||||
// indirect matrix multiplication
|
||||
// !!!
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) {
|
||||
switch (src2->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break;
|
||||
|
@ -1514,19 +1736,22 @@ void ggml_metal_graph_compute(
|
|||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne20 length:sizeof(ne20) atIndex:3];
|
||||
[encoder setBytes:&ne22 length:sizeof(ne22) atIndex:4];
|
||||
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:5];
|
||||
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:6];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
|
||||
[encoder setBytes:&r2 length:sizeof(r2) atIndex:13];
|
||||
[encoder setBytes:&r3 length:sizeof(r3) atIndex:14];
|
||||
[encoder setBytes:&idx length:sizeof(idx) atIndex:15];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
|
||||
[encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
|
||||
[encoder setBytes:&ne22 length:sizeof(ne22) atIndex:5];
|
||||
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6];
|
||||
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:7];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:8];
|
||||
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:9];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
|
||||
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&r2 length:sizeof(r2) atIndex:16];
|
||||
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
|
||||
[encoder setBytes:&idx length:sizeof(idx) atIndex:18];
|
||||
// TODO: how to make this an array? read Metal docs
|
||||
for (int j = 0; j < n_as; ++j) {
|
||||
struct ggml_tensor * src_cur = dst->src[2 + j];
|
||||
|
@ -1534,11 +1759,157 @@ void ggml_metal_graph_compute(
|
|||
size_t offs_src_cur = 0;
|
||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
||||
|
||||
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:16 + j];
|
||||
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
|
||||
}
|
||||
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
|
||||
// TODO: processing one row at a time (ne11 -> 1) is not efficient
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
int nth0 = 32;
|
||||
int nth1 = 1;
|
||||
int nrows = 1;
|
||||
//printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
|
||||
// use custom matrix x vector kernel
|
||||
switch (src2t) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f32_f32];
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
nth0 = 32;
|
||||
nth1 = 1;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f16_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_0_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_1_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_0_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_1_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q8_0_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q2_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q3_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
{
|
||||
nth0 = 4; //1;
|
||||
nth1 = 8; //32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
};
|
||||
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
|
||||
[encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
|
||||
[encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
|
||||
[encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
|
||||
[encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
|
||||
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
|
||||
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
|
||||
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
|
||||
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17];
|
||||
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19];
|
||||
[encoder setBytes:&r2 length:sizeof(r2) atIndex:20];
|
||||
[encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
|
||||
[encoder setBytes:&idx length:sizeof(idx) atIndex:22];
|
||||
// TODO: how to make this an array? read Metal docs
|
||||
for (int j = 0; j < n_as; ++j) {
|
||||
struct ggml_tensor * src_cur = dst->src[2 + j];
|
||||
|
||||
size_t offs_src_cur = 0;
|
||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
||||
|
||||
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
|
||||
}
|
||||
|
||||
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
|
||||
src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
|
||||
src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_Q3_K) {
|
||||
#ifdef GGML_QKK_64
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
#else
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
#endif
|
||||
}
|
||||
else if (src2t == GGML_TYPE_Q5_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_Q6_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else {
|
||||
const int64_t ny = (_ne1 + nrows - 1)/nrows;
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
|
@ -1559,16 +1930,19 @@ void ggml_metal_graph_compute(
|
|||
default: GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
|
||||
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
|
||||
[encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10];
|
||||
|
||||
const int64_t n = ggml_nelements(src1);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
{
|
||||
|
@ -1595,6 +1969,38 @@ void ggml_metal_graph_compute(
|
|||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_GROUP_NORM:
|
||||
{
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
|
||||
//float eps;
|
||||
//memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
const float eps = 1e-6f; // TODO: temporarily hardcoded
|
||||
|
||||
const int32_t n_groups = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
int nth = 32; // SIMD width
|
||||
|
||||
//while (nth < ne00/4 && nth < 1024) {
|
||||
// nth *= 2;
|
||||
//}
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_group_norm];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:5];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
|
||||
[encoder setBytes:&eps length:sizeof( float) atIndex:9];
|
||||
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_NORM:
|
||||
{
|
||||
float eps;
|
||||
|
@ -1764,6 +2170,65 @@ void ggml_metal_graph_compute(
|
|||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
|
||||
} break;
|
||||
case GGML_OP_UPSCALE:
|
||||
{
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
const int sf = dst->op_params[0];
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_upscale_f32];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_PAD:
|
||||
{
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_pad_f32];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_ARGSORT:
|
||||
{
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
@ -1785,6 +2250,22 @@ void ggml_metal_graph_compute(
|
|||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
{
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
float slope;
|
||||
memcpy(&slope, dst->op_params, sizeof(float));
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_leaky_relu_f32];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&slope length:sizeof(slope) atIndex:2];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_CONT:
|
||||
|
@ -1813,7 +2294,7 @@ void ggml_metal_graph_compute(
|
|||
{
|
||||
switch (dstt) {
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
|
||||
case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f32]; break;
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
};
|
||||
} break;
|
||||
|
|
1616
ggml-metal.metal
1616
ggml-metal.metal
File diff suppressed because it is too large
Load diff
|
@ -3114,7 +3114,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
|||
|
||||
size_t vl = __riscv_vsetvl_e8m1(qk/2);
|
||||
|
||||
// These tempory registers are for masking and shift operations
|
||||
// These temporary registers are for masking and shift operations
|
||||
vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
|
||||
vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
|
||||
|
||||
|
@ -4757,7 +4757,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
|
||||
vl = 16;
|
||||
|
||||
// retreive lane to multiply with scale
|
||||
// retrieve lane to multiply with scale
|
||||
vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
|
||||
vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
|
||||
vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
|
||||
|
|
47
ggml.h
47
ggml.h
|
@ -215,9 +215,9 @@
|
|||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
||||
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_PARAMS 1024
|
||||
#define GGML_MAX_PARAMS 2048
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_SRC 6
|
||||
#define GGML_MAX_SRC 10
|
||||
#define GGML_MAX_NAME 64
|
||||
#define GGML_MAX_OP_PARAMS 64
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
@ -423,7 +423,9 @@ extern "C" {
|
|||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
GGML_OP_PAD,
|
||||
GGML_OP_ARGSORT,
|
||||
GGML_OP_LEAKY_RELU,
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
GGML_OP_FLASH_FF,
|
||||
|
@ -463,7 +465,6 @@ extern "C" {
|
|||
GGML_UNARY_OP_GELU,
|
||||
GGML_UNARY_OP_GELU_QUICK,
|
||||
GGML_UNARY_OP_SILU,
|
||||
GGML_UNARY_OP_LEAKY,
|
||||
|
||||
GGML_UNARY_OP_COUNT,
|
||||
};
|
||||
|
@ -501,7 +502,6 @@ extern "C" {
|
|||
|
||||
struct ggml_backend_buffer * buffer;
|
||||
|
||||
int n_dims;
|
||||
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
||||
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
||||
// nb[0] = ggml_type_size(type)
|
||||
|
@ -533,7 +533,7 @@ extern "C" {
|
|||
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
|
||||
char padding[12];
|
||||
char padding[8];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
|
@ -638,11 +638,14 @@ extern "C" {
|
|||
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
||||
|
||||
GGML_API int ggml_blck_size (enum ggml_type type);
|
||||
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
||||
GGML_API int ggml_blck_size(enum ggml_type type);
|
||||
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||
|
||||
GGML_DEPRECATED(
|
||||
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
||||
"use ggml_row_size() instead");
|
||||
|
||||
GGML_API const char * ggml_type_name(enum ggml_type type);
|
||||
GGML_API const char * ggml_op_name (enum ggml_op op);
|
||||
|
@ -661,6 +664,11 @@ extern "C" {
|
|||
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
|
||||
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
|
||||
|
@ -793,6 +801,9 @@ extern "C" {
|
|||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// dst = a
|
||||
// view(dst, nb1, nb2, nb3, offset) += b
|
||||
// return dst
|
||||
GGML_API struct ggml_tensor * ggml_acc(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
@ -957,15 +968,14 @@ extern "C" {
|
|||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_leaky(
|
||||
GGML_API struct ggml_tensor * ggml_leaky_relu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
struct ggml_tensor * a, float negative_slope, bool inplace);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// TODO: double-check this computation is correct
|
||||
GGML_API struct ggml_tensor * ggml_gelu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
@ -1051,7 +1061,8 @@ extern "C" {
|
|||
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
||||
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * as[],
|
||||
struct ggml_tensor * const as[],
|
||||
int n_as,
|
||||
struct ggml_tensor * ids,
|
||||
int id,
|
||||
struct ggml_tensor * b);
|
||||
|
@ -1263,6 +1274,7 @@ extern "C" {
|
|||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// supports 3D: a->ne[2] == b->ne[1]
|
||||
GGML_API struct ggml_tensor * ggml_get_rows(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
@ -1549,6 +1561,15 @@ extern "C" {
|
|||
struct ggml_tensor * a,
|
||||
int scale_factor);
|
||||
|
||||
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
|
||||
GGML_API struct ggml_tensor * ggml_pad(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int p0,
|
||||
int p1,
|
||||
int p2,
|
||||
int p3);
|
||||
|
||||
// sort rows
|
||||
enum ggml_sort_order {
|
||||
GGML_SORT_ASC,
|
||||
|
|
|
@ -61,7 +61,7 @@ If you want to publish the package manually for any reason, you need to have `tw
|
|||
pip install build twine
|
||||
```
|
||||
|
||||
Then, folow these steps to release a new version:
|
||||
Then, follow these steps to release a new version:
|
||||
|
||||
1. Bump the version in `pyproject.toml`.
|
||||
2. Build the package:
|
||||
|
|
|
@ -38,6 +38,8 @@ class Keys:
|
|||
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
||||
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
||||
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
EXPERT_COUNT = "{arch}.expert_count"
|
||||
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
|
@ -111,10 +113,14 @@ class MODEL_TENSOR(IntEnum):
|
|||
ATTN_NORM = auto()
|
||||
ATTN_NORM_2 = auto()
|
||||
ATTN_ROT_EMBD = auto()
|
||||
FFN_GATE_INP = auto()
|
||||
FFN_NORM = auto()
|
||||
FFN_GATE = auto()
|
||||
FFN_DOWN = auto()
|
||||
FFN_UP = auto()
|
||||
FFN_NORM = auto()
|
||||
FFN_GATE_EXP = auto()
|
||||
FFN_DOWN_EXP = auto()
|
||||
FFN_UP_EXP = auto()
|
||||
ATTN_Q_NORM = auto()
|
||||
ATTN_K_NORM = auto()
|
||||
|
||||
|
@ -154,10 +160,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
|
||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
|
@ -172,10 +182,14 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.GPTNEOX: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
|
|
@ -339,6 +339,12 @@ class GGUFWriter:
|
|||
def add_clamp_kqv(self, value: float) -> None:
|
||||
self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
|
||||
|
||||
def add_expert_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
|
||||
|
||||
def add_expert_used_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
|
||||
|
||||
def add_layer_norm_eps(self, value: float) -> None:
|
||||
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
|
||||
|
||||
|
|
|
@ -149,6 +149,11 @@ class TensorNameMap:
|
|||
"model.layers.{bid}.ln2", # yi
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
"layers.{bid}.feed_forward.gate", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
MODEL_TENSOR.FFN_UP: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||
|
@ -164,11 +169,21 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.mlp.w1", # qwen
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
|
||||
),
|
||||
|
||||
# Feed-forward gate
|
||||
MODEL_TENSOR.FFN_GATE: (
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
"transformer.h.{bid}.mlp.w2", # qwen
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
"transformer.h.{bid}.mlp.w2", # qwen
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||
"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
|
||||
),
|
||||
|
||||
# Feed-forward down
|
||||
|
@ -185,6 +200,11 @@ class TensorNameMap:
|
|||
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
|
||||
),
|
||||
|
@ -213,11 +233,14 @@ class TensorNameMap:
|
|||
for tensor, keys in self.block_mappings_cfg.items():
|
||||
if tensor not in MODEL_TENSORS[arch]:
|
||||
continue
|
||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
|
||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
key = key.format(bid = bid)
|
||||
self.mapping[key] = (tensor, tensor_name)
|
||||
# TODO: make this configurable
|
||||
n_experts = 8
|
||||
for xid in range(n_experts):
|
||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
|
||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
key = key.format(bid = bid, xid = xid)
|
||||
self.mapping[key] = (tensor, tensor_name)
|
||||
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
|
||||
result = self.mapping.get(key)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.6.0"
|
||||
version = "0.7.0"
|
||||
description = "Read and write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
|
|
381
llama.cpp
381
llama.cpp
|
@ -95,7 +95,8 @@
|
|||
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
||||
#endif
|
||||
|
||||
#define LLAMA_MAX_NODES 8192
|
||||
#define LLAMA_MAX_NODES 8192
|
||||
#define LLAMA_MAX_EXPERTS 8
|
||||
|
||||
//
|
||||
// logging
|
||||
|
@ -235,6 +236,8 @@ enum llm_kv {
|
|||
LLM_KV_FEED_FORWARD_LENGTH,
|
||||
LLM_KV_USE_PARALLEL_RESIDUAL,
|
||||
LLM_KV_TENSOR_DATA_LAYOUT,
|
||||
LLM_KV_EXPERT_COUNT,
|
||||
LLM_KV_EXPERT_USED_COUNT,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
|
@ -285,6 +288,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
|||
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
|
||||
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
|
||||
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
||||
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
||||
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
||||
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
|
@ -342,10 +347,14 @@ enum llm_tensor {
|
|||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_NORM_2,
|
||||
LLM_TENSOR_ATTN_ROT_EMBD,
|
||||
LLM_TENSOR_FFN_GATE_INP,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_FFN_GATE,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_FFN_DOWN_EXP,
|
||||
LLM_TENSOR_FFN_GATE_EXP,
|
||||
LLM_TENSOR_FFN_UP_EXP,
|
||||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
};
|
||||
|
@ -364,10 +373,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
||||
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -589,6 +602,10 @@ struct LLM_TN {
|
|||
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
|
||||
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
|
||||
}
|
||||
|
||||
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
|
||||
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
|
@ -1172,6 +1189,8 @@ struct llama_hparams {
|
|||
uint32_t n_layer;
|
||||
uint32_t n_rot;
|
||||
uint32_t n_ff;
|
||||
uint32_t n_expert = 0;
|
||||
uint32_t n_expert_used = 0;
|
||||
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
|
@ -1186,15 +1205,18 @@ struct llama_hparams {
|
|||
float f_max_alibi_bias;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
if (this->vocab_only != other.vocab_only) return true;
|
||||
if (this->n_vocab != other.n_vocab) return true;
|
||||
if (this->n_ctx_train != other.n_ctx_train) return true;
|
||||
if (this->n_embd != other.n_embd) return true;
|
||||
if (this->n_head != other.n_head) return true;
|
||||
if (this->n_head_kv != other.n_head_kv) return true;
|
||||
if (this->n_layer != other.n_layer) return true;
|
||||
if (this->n_rot != other.n_rot) return true;
|
||||
if (this->n_ff != other.n_ff) return true;
|
||||
if (this->vocab_only != other.vocab_only) return true;
|
||||
if (this->n_vocab != other.n_vocab) return true;
|
||||
if (this->n_ctx_train != other.n_ctx_train) return true;
|
||||
if (this->n_embd != other.n_embd) return true;
|
||||
if (this->n_head != other.n_head) return true;
|
||||
if (this->n_head_kv != other.n_head_kv) return true;
|
||||
if (this->n_layer != other.n_layer) return true;
|
||||
if (this->n_rot != other.n_rot) return true;
|
||||
if (this->n_ff != other.n_ff) return true;
|
||||
if (this->n_expert != other.n_expert) return true;
|
||||
if (this->n_expert_used != other.n_expert_used) return true;
|
||||
|
||||
if (this->rope_finetuned != other.rope_finetuned) return true;
|
||||
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
|
||||
|
||||
|
@ -1276,6 +1298,12 @@ struct llama_layer {
|
|||
struct ggml_tensor * ffn_down; // w2
|
||||
struct ggml_tensor * ffn_up; // w3
|
||||
|
||||
// ff MoE
|
||||
struct ggml_tensor * ffn_gate_inp;
|
||||
struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
|
||||
struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
|
||||
struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
|
||||
|
||||
// ff bias
|
||||
struct ggml_tensor * ffn_down_b; // b2
|
||||
struct ggml_tensor * ffn_up_b; // b3
|
||||
|
@ -1495,6 +1523,10 @@ struct llama_context {
|
|||
|
||||
// decode output (2-dimensional array: [n_tokens][n_vocab])
|
||||
std::vector<float> logits;
|
||||
#ifndef NDEBUG
|
||||
// guard against access to unset logits
|
||||
std::vector<bool> logits_valid;
|
||||
#endif
|
||||
bool logits_all = false;
|
||||
|
||||
// input embedding (1-dimensional array: [n_embd])
|
||||
|
@ -1545,7 +1577,7 @@ static bool llama_kv_cache_init(
|
|||
cache.cells.clear();
|
||||
cache.cells.resize(n_ctx);
|
||||
|
||||
cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
|
||||
cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
|
||||
memset(cache.buf.data, 0, cache.buf.size);
|
||||
|
||||
struct ggml_init_params params;
|
||||
|
@ -2473,6 +2505,16 @@ static void llm_load_hparams(
|
|||
ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
|
||||
ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
|
||||
ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
|
||||
GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
|
||||
GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
|
||||
if (hparams.n_expert > 0) {
|
||||
GGML_ASSERT(hparams.n_expert_used > 0);
|
||||
} else {
|
||||
GGML_ASSERT(hparams.n_expert_used == 0);
|
||||
}
|
||||
|
||||
// n_head_kv is optional, default to n_head
|
||||
hparams.n_head_kv = hparams.n_head;
|
||||
|
@ -2791,7 +2833,7 @@ static void llm_load_vocab(
|
|||
// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
|
||||
// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
|
||||
// are special tokens.
|
||||
// From testing, this appears to corelate 1:1 with special tokens.
|
||||
// From testing, this appears to correlate 1:1 with special tokens.
|
||||
//
|
||||
|
||||
// Counting special tokens and verifying in only one direction
|
||||
|
@ -2904,6 +2946,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
||||
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
|
||||
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
||||
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
||||
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
||||
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
||||
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
||||
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
||||
|
@ -3062,9 +3106,26 @@ static void llm_load_tensors(
|
|||
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
|
||||
|
||||
if (layer.ffn_gate_inp == nullptr) {
|
||||
GGML_ASSERT(hparams.n_expert == 0);
|
||||
GGML_ASSERT(hparams.n_expert_used == 0);
|
||||
|
||||
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
} else {
|
||||
GGML_ASSERT(hparams.n_expert > 0);
|
||||
GGML_ASSERT(hparams.n_expert_used > 0);
|
||||
|
||||
// MoE branch
|
||||
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
|
||||
layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
|
||||
layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
|
||||
}
|
||||
}
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
vram_weights +=
|
||||
|
@ -3074,8 +3135,18 @@ static void llm_load_tensors(
|
|||
(layer.bk ? ggml_nbytes(layer.bk) : 0) +
|
||||
(layer.bv ? ggml_nbytes(layer.bv) : 0) +
|
||||
(layer.bo ? ggml_nbytes(layer.bo) : 0) +
|
||||
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
|
||||
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
||||
ggml_nbytes(layer.ffn_norm);
|
||||
|
||||
if (layer.ffn_gate_inp == nullptr) {
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
||||
} else {
|
||||
vram_weights += ggml_nbytes(layer.ffn_gate_inp);
|
||||
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
|
@ -3789,8 +3860,8 @@ static void llm_build_k_shift(
|
|||
ggml_rope_custom_inplace(ctx,
|
||||
ggml_view_3d(ctx, kv.k_l[il],
|
||||
n_embd_head, n_head_kv, n_ctx,
|
||||
ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
|
||||
ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_head),
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
|
||||
0),
|
||||
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
@ -3819,7 +3890,7 @@ static void llm_build_kv_store(
|
|||
cb(v_cur_t, "v_cur_t", il);
|
||||
|
||||
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
|
||||
(ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
|
||||
(ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
|
||||
cb(k_cache_view, "k_cache_view", il);
|
||||
|
||||
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
|
||||
|
@ -3978,8 +4049,8 @@ static struct ggml_tensor * llm_build_kqv(
|
|||
struct ggml_tensor * k =
|
||||
ggml_view_3d(ctx, kv.k_l[il],
|
||||
n_embd_head, n_kv, n_head_kv,
|
||||
ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
|
||||
ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_head),
|
||||
0);
|
||||
cb(k, "k", il);
|
||||
|
||||
|
@ -4053,6 +4124,8 @@ struct llm_build_context {
|
|||
const int64_t n_head_kv;
|
||||
const int64_t n_embd_head;
|
||||
const int64_t n_embd_gqa;
|
||||
const int64_t n_expert;
|
||||
const int64_t n_expert_used;
|
||||
|
||||
const float freq_base;
|
||||
const float freq_scale;
|
||||
|
@ -4094,6 +4167,8 @@ struct llm_build_context {
|
|||
n_head_kv (hparams.n_head_kv),
|
||||
n_embd_head (hparams.n_embd_head()),
|
||||
n_embd_gqa (hparams.n_embd_gqa()),
|
||||
n_expert (hparams.n_expert),
|
||||
n_expert_used (hparams.n_expert_used),
|
||||
freq_base (cparams.rope_freq_base),
|
||||
freq_scale (cparams.rope_freq_scale),
|
||||
ext_factor (cparams.yarn_ext_factor),
|
||||
|
@ -4218,7 +4293,7 @@ struct llm_build_context {
|
|||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
|
@ -4230,6 +4305,69 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_down, NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
// MoE branch
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
||||
cb(logits, "ffn_moe_logits", il);
|
||||
|
||||
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
||||
cb(probs, "ffn_moe_probs", il);
|
||||
|
||||
// select experts
|
||||
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
||||
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
||||
|
||||
ggml_tensor * weights = ggml_get_rows(ctx0,
|
||||
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
||||
cb(weights, "ffn_moe_weights", il);
|
||||
|
||||
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
||||
|
||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
||||
cb(weights_sum, "ffn_moe_weights_sum", il);
|
||||
|
||||
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
||||
cb(weights, "ffn_moe_weights_norm", il);
|
||||
|
||||
// compute expert outputs
|
||||
ggml_tensor * moe_out = nullptr;
|
||||
|
||||
for (int i = 0; i < n_expert_used; ++i) {
|
||||
ggml_tensor * cur_expert;
|
||||
|
||||
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
|
||||
cb(cur_up, "ffn_moe_up", il);
|
||||
|
||||
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
|
||||
cb(cur_gate, "ffn_moe_gate", il);
|
||||
|
||||
cur_gate = ggml_silu(ctx0, cur_gate);
|
||||
cb(cur_gate, "ffn_moe_silu", il);
|
||||
|
||||
cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
|
||||
cb(cur_expert, "ffn_moe_gate_par", il);
|
||||
|
||||
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
||||
cb(cur_expert, "ffn_moe_down", il);
|
||||
|
||||
cur_expert = ggml_mul(ctx0, cur_expert,
|
||||
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
||||
cb(cur_expert, "ffn_moe_weighted", il);
|
||||
|
||||
if (i == 0) {
|
||||
moe_out = cur_expert;
|
||||
} else {
|
||||
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
}
|
||||
}
|
||||
|
||||
cur = moe_out;
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
@ -5484,6 +5622,20 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|||
{ "ffn_relu", OFFLOAD_FUNC },
|
||||
{ "ffn_sqr(relu)", OFFLOAD_FUNC },
|
||||
|
||||
{ "ffn_moe_logits", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_probs", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_argsort", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_weights", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_weights_sum", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_weights_norm", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_weighted", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_up", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_gate", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_silu", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_gate_par", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_down", OFFLOAD_FUNC },
|
||||
{ "ffn_moe_out", OFFLOAD_FUNC },
|
||||
|
||||
{ "l_out", OFFLOAD_FUNC },
|
||||
|
||||
{ "result_norm", OFFLOAD_FUNC_EMB },
|
||||
|
@ -5888,7 +6040,7 @@ static int llama_decode_internal(
|
|||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_vocab = hparams.n_vocab;
|
||||
|
||||
// helpers for smoother batch API transistion
|
||||
// helpers for smoother batch API transition
|
||||
// after deprecating the llama_eval calls, these will be removed
|
||||
std::vector<llama_pos> pos;
|
||||
|
||||
|
@ -6068,6 +6220,14 @@ static int llama_decode_internal(
|
|||
{
|
||||
auto & logits_out = lctx.logits;
|
||||
|
||||
#ifndef NDEBUG
|
||||
auto & logits_valid = lctx.logits_valid;
|
||||
logits_valid.clear();
|
||||
logits_valid.resize(n_tokens);
|
||||
|
||||
logits_out.clear();
|
||||
#endif
|
||||
|
||||
if (batch.logits) {
|
||||
logits_out.resize(n_vocab * n_tokens);
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
|
@ -6075,13 +6235,22 @@ static int llama_decode_internal(
|
|||
continue;
|
||||
}
|
||||
memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
|
||||
#ifndef NDEBUG
|
||||
logits_valid[i] = true;
|
||||
#endif
|
||||
}
|
||||
} else if (lctx.logits_all) {
|
||||
logits_out.resize(n_vocab * n_tokens);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
|
||||
#ifndef NDEBUG
|
||||
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
||||
#endif
|
||||
} else {
|
||||
logits_out.resize(n_vocab);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
|
||||
#ifndef NDEBUG
|
||||
logits_valid[n_tokens - 1] = true;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6691,12 +6860,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|||
|
||||
// loop over the text
|
||||
while (true) {
|
||||
// find the first occurence of a given special token in this fragment
|
||||
// find the first occurrence of a given special token in this fragment
|
||||
// passing offset argument only limit the "search area" but match coordinates
|
||||
// are still relative to the source full raw_text
|
||||
auto match = raw_text->find(special_token, raw_text_base_offset);
|
||||
|
||||
// no occurences found, stop processing this fragment for a given special token
|
||||
// no occurrences found, stop processing this fragment for a given special token
|
||||
if (match == std::string::npos) break;
|
||||
|
||||
// check if match is within bounds of offset <-> length
|
||||
|
@ -7895,7 +8064,7 @@ struct llama_beam_search_data {
|
|||
}
|
||||
|
||||
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
||||
// The repetative patterns below reflect the 2 stages of heaps:
|
||||
// The repetitive patterns below reflect the 2 stages of heaps:
|
||||
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
||||
// * If the heap is full and a new element is found that should be included, pop the
|
||||
// least element to the back(), replace it with the new, then push it into the heap.
|
||||
|
@ -8133,11 +8302,9 @@ static void llama_convert_tensor_internal(
|
|||
workers.clear();
|
||||
}
|
||||
|
||||
static ggml_type get_k_quant_type(
|
||||
quantize_state_internal & qs,
|
||||
ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
|
||||
) {
|
||||
static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||
const llm_arch arch = qs.model.arch;
|
||||
const auto tn = LLM_TN(arch);
|
||||
|
@ -8171,7 +8338,18 @@ static ggml_type get_k_quant_type(
|
|||
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
||||
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
if (qs.model.hparams.n_expert == 8) {
|
||||
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
|
||||
// TODO: explore better strategies
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
++qs.i_attention_wv;
|
||||
} else if (name.find("attn_k.weight") != std::string::npos) {
|
||||
if (qs.model.hparams.n_expert == 8) {
|
||||
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
|
||||
// TODO: explore better strategies
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
|
@ -8380,10 +8558,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
|
||||
|
||||
// quantize only 2D tensors
|
||||
quantize &= (tensor->n_dims == 2);
|
||||
quantize &= (ggml_n_dims(tensor) == 2);
|
||||
quantize &= params->quantize_output_tensor || name != "output.weight";
|
||||
quantize &= !params->only_copy;
|
||||
|
||||
// do not quantize expert gating tensors
|
||||
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
||||
|
||||
enum ggml_type new_type;
|
||||
void * new_data;
|
||||
size_t new_size;
|
||||
|
@ -8532,53 +8713,60 @@ static int llama_apply_lora_from_file_internal(
|
|||
|
||||
const int64_t t_start_lora_us = ggml_time_us();
|
||||
|
||||
auto fin = std::ifstream(path_lora, std::ios::binary);
|
||||
if (!fin) {
|
||||
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
|
||||
return 1;
|
||||
}
|
||||
llama_file fin(path_lora, "rb");
|
||||
|
||||
// verify magic and version
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
uint32_t format_version;
|
||||
fin.read((char *) &format_version, sizeof(format_version));
|
||||
uint32_t magic = fin.read_u32();
|
||||
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
||||
LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
uint32_t format_version = fin.read_u32();
|
||||
if (format_version != 1) {
|
||||
LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t lora_r;
|
||||
int32_t lora_alpha;
|
||||
fin.read((char *) &lora_r, sizeof(lora_r));
|
||||
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
||||
int32_t lora_r = fin.read_u32();
|
||||
int32_t lora_alpha = fin.read_u32();
|
||||
float scaling = scale * (float)lora_alpha / (float)lora_r;
|
||||
|
||||
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
||||
|
||||
// create a name -> tensor map of the model to accelerate lookups
|
||||
// find the max tensor size to estimate the required temporary buffer size
|
||||
size_t max_tensor_size = 0;
|
||||
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
||||
for (const auto & kv : model.tensors_by_name) {
|
||||
model_tensors.insert(kv);
|
||||
size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
|
||||
max_tensor_size = std::max(max_tensor_size, f32_size);
|
||||
}
|
||||
|
||||
// create a temporary ggml context to store the lora tensors
|
||||
// todo: calculate size from biggest possible tensor
|
||||
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
|
||||
// TODO: use ggml-alloc
|
||||
size_t lora_ctx_size = max_tensor_size * 3;
|
||||
LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
|
||||
std::vector<uint8_t> lora_buf(lora_ctx_size);
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = lora_buf.size();
|
||||
params.mem_buffer = lora_buf.data();
|
||||
params.no_alloc = false;
|
||||
|
||||
ggml_context * lora_ctx = ggml_init(params);
|
||||
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
||||
using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
|
||||
|
||||
// create a name -> tensor map of the model to accelerate lookups
|
||||
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
||||
for (const auto & kv : model.tensors_by_name) {
|
||||
model_tensors.insert(kv);
|
||||
}
|
||||
unique_context lora_ctx(nullptr, ggml_free);
|
||||
lora_ctx.reset(ggml_init(params));
|
||||
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
||||
|
||||
// load base model
|
||||
std::unique_ptr<llama_model_loader> ml;
|
||||
ggml_context * base_ctx = NULL;
|
||||
|
||||
unique_context base_ctx(nullptr, ggml_free);
|
||||
std::vector<uint8_t> base_buf;
|
||||
if (path_base_model) {
|
||||
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
|
@ -8587,6 +8775,7 @@ static int llama_apply_lora_from_file_internal(
|
|||
size_t ctx_size;
|
||||
size_t mmapped_size;
|
||||
ml->calc_sizes(ctx_size, mmapped_size);
|
||||
|
||||
base_buf.resize(ctx_size);
|
||||
|
||||
ggml_init_params base_params;
|
||||
|
@ -8594,9 +8783,9 @@ static int llama_apply_lora_from_file_internal(
|
|||
base_params.mem_buffer = base_buf.data();
|
||||
base_params.no_alloc = ml->use_mmap;
|
||||
|
||||
base_ctx = ggml_init(base_params);
|
||||
base_ctx.reset(ggml_init(base_params));
|
||||
|
||||
// maybe this should in llama_model_loader
|
||||
// maybe this should be in llama_model_loader
|
||||
if (ml->use_mmap) {
|
||||
ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
|
||||
}
|
||||
|
@ -8609,27 +8798,35 @@ static int llama_apply_lora_from_file_internal(
|
|||
std::vector<uint8_t> work_buffer;
|
||||
|
||||
while (true) {
|
||||
if (fin.tell() == fin.size) {
|
||||
// eof
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t name_len;
|
||||
int32_t ftype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
fin.read_raw(&n_dims, sizeof(n_dims));
|
||||
fin.read_raw(&name_len, sizeof(name_len));
|
||||
fin.read_raw(&ftype, sizeof(ftype));
|
||||
|
||||
if (n_dims != 1 && n_dims != 2) {
|
||||
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
fin.read_raw(&ne[i], sizeof(ne[i]));
|
||||
}
|
||||
|
||||
std::string name;
|
||||
{
|
||||
GGML_ASSERT(name_len <= 1024);
|
||||
char buf[1024];
|
||||
fin.read(buf, length);
|
||||
name = std::string(buf, length);
|
||||
fin.read_raw(buf, name_len);
|
||||
name = std::string(buf, name_len);
|
||||
}
|
||||
|
||||
// check for lora suffix and get the type of tensor
|
||||
|
@ -8643,7 +8840,7 @@ static int llama_apply_lora_from_file_internal(
|
|||
std::string lora_type = name.substr(pos + lora_suffix.length());
|
||||
std::string base_name = name;
|
||||
base_name.erase(pos);
|
||||
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
||||
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
|
||||
|
||||
if (model_tensors.find(base_name) == model_tensors.end()) {
|
||||
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
||||
|
@ -8662,22 +8859,15 @@ static int llama_apply_lora_from_file_internal(
|
|||
return false;
|
||||
}
|
||||
}
|
||||
ggml_tensor * lora_tensor;
|
||||
if (n_dims == 2) {
|
||||
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
||||
}
|
||||
else {
|
||||
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
||||
return 1;
|
||||
}
|
||||
ggml_set_name(lora_tensor, "lora_tensor");
|
||||
ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
|
||||
ggml_set_name(lora_tensor, name.c_str());
|
||||
|
||||
// load tensor data
|
||||
size_t offset = fin.tellg();
|
||||
size_t offset = fin.tell();
|
||||
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
||||
offset = (offset + 31) & -32;
|
||||
fin.seekg(offset);
|
||||
fin.read((char*)lora_tensor->data, tensor_data_size);
|
||||
fin.seek(offset, SEEK_SET);
|
||||
fin.read_raw(lora_tensor->data, tensor_data_size);
|
||||
|
||||
lora_tensors[name] = lora_tensor;
|
||||
|
||||
|
@ -8707,13 +8897,11 @@ static int llama_apply_lora_from_file_internal(
|
|||
|
||||
// load from base model
|
||||
if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
|
||||
// TODO: throw
|
||||
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// TODO: not tested!! maybe not working!
|
||||
base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
|
||||
base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU);
|
||||
ml->load_data_for(base_t);
|
||||
} else {
|
||||
base_t = dest_t;
|
||||
|
@ -8742,43 +8930,45 @@ static int llama_apply_lora_from_file_internal(
|
|||
}
|
||||
|
||||
// w = w + BA*s
|
||||
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
||||
ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
|
||||
offload_func(BA);
|
||||
ggml_set_name(BA, "BA");
|
||||
|
||||
if (scaling != 1.0f) {
|
||||
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
||||
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling);
|
||||
ggml_set_name(scale_tensor, "scale_tensor");
|
||||
|
||||
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
|
||||
BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor);
|
||||
offload_func(BA);
|
||||
ggml_set_name(BA, "BA_scaled");
|
||||
}
|
||||
|
||||
ggml_tensor * r;
|
||||
if (base_t == dest_t) {
|
||||
r = ggml_add_inplace(lora_ctx, dest_t, BA);
|
||||
r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
|
||||
offload_func_force_inplace(r);
|
||||
ggml_set_name(r, "r_add_inplace");
|
||||
}
|
||||
else {
|
||||
r = ggml_add(lora_ctx, base_t, BA);
|
||||
r = ggml_add(lora_ctx.get(), base_t, BA);
|
||||
offload_func(r);
|
||||
ggml_set_name(r, "r_add");
|
||||
|
||||
r = ggml_cpy(lora_ctx, r, dest_t);
|
||||
r = ggml_cpy(lora_ctx.get(), r, dest_t);
|
||||
offload_func(r);
|
||||
ggml_set_name(r, "r_cpy");
|
||||
}
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
||||
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
|
||||
ggml_build_forward_expand(gf, r);
|
||||
|
||||
ggml_graph_compute_helper(work_buffer, gf, n_threads);
|
||||
|
||||
// the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
|
||||
GGML_ASSERT(lora_tensors.size() == 2);
|
||||
|
||||
// we won't need these tensors again, reset the context to save memory
|
||||
ggml_free(lora_ctx);
|
||||
lora_ctx = ggml_init(params);
|
||||
lora_ctx.reset(ggml_init(params));
|
||||
lora_tensors.clear();
|
||||
|
||||
n_tensors++;
|
||||
|
@ -8788,12 +8978,6 @@ static int llama_apply_lora_from_file_internal(
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: this should be in a destructor, it will leak on failure
|
||||
ggml_free(lora_ctx);
|
||||
if (base_ctx) {
|
||||
ggml_free(base_ctx);
|
||||
}
|
||||
|
||||
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
||||
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
||||
|
||||
|
@ -9958,6 +10142,7 @@ float * llama_get_logits(struct llama_context * ctx) {
|
|||
}
|
||||
|
||||
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
||||
assert(ctx->logits_valid.at(i));
|
||||
return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
|
||||
}
|
||||
|
||||
|
|
3
llama.h
3
llama.h
|
@ -39,6 +39,7 @@
|
|||
|
||||
#define LLAMA_MAX_RNG_STATE (64*1024)
|
||||
|
||||
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
|
@ -216,7 +217,7 @@ extern "C" {
|
|||
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||
bool embedding; // embedding mode only
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
};
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
numpy==1.24.4
|
||||
sentencepiece==0.1.98
|
||||
transformers>=4.34.0
|
||||
gguf>=0.1.0
|
||||
protobuf>=4.21.0
|
||||
|
|
38
scripts/get-flags.mk
Normal file
38
scripts/get-flags.mk
Normal file
|
@ -0,0 +1,38 @@
|
|||
ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
|
||||
GF_CC_IS_GCC = 1
|
||||
GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
|
||||
else
|
||||
GF_CC_IS_CLANG = 1
|
||||
ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
|
||||
GF_CC_IS_LLVM_CLANG = 1
|
||||
else
|
||||
GF_CC_IS_APPLE_CLANG = 1
|
||||
endif
|
||||
GF_CC_VER := \
|
||||
$(shell $(GF_CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
|
||||
| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
|
||||
endif
|
||||
|
||||
ifeq ($(GF_CC_IS_CLANG), 1)
|
||||
# clang options
|
||||
GF_CFLAGS = -Wunreachable-code-break -Wunreachable-code-return
|
||||
GF_CXXFLAGS = -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
|
||||
|
||||
ifneq '' '$(and $(GF_CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 030800)))'
|
||||
GF_CFLAGS += -Wdouble-promotion
|
||||
endif
|
||||
ifneq '' '$(and $(GF_CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 070300)))'
|
||||
GF_CFLAGS += -Wdouble-promotion
|
||||
endif
|
||||
else
|
||||
# gcc options
|
||||
GF_CFLAGS = -Wdouble-promotion
|
||||
GF_CXXFLAGS = -Wno-array-bounds
|
||||
|
||||
ifeq ($(shell expr $(GF_CC_VER) \>= 070100), 1)
|
||||
GF_CXXFLAGS += -Wno-format-truncation
|
||||
endif
|
||||
ifeq ($(shell expr $(GF_CC_VER) \>= 080100), 1)
|
||||
GF_CXXFLAGS += -Wextra-semi
|
||||
endif
|
||||
endif
|
|
@ -20,8 +20,6 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
|||
size_t size = ggml_nelements(tensor);
|
||||
std::vector<float> data(size);
|
||||
|
||||
std::random_device rd;
|
||||
|
||||
#if 0
|
||||
std::default_random_engine generator(rd());
|
||||
std::uniform_real_distribution<float> distribution(min, max);
|
||||
|
@ -31,6 +29,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
|||
}
|
||||
#endif
|
||||
auto init_thread = [&](size_t start, size_t end) {
|
||||
std::random_device rd;
|
||||
std::default_random_engine generator(rd());
|
||||
std::uniform_real_distribution<float> distribution(min, max);
|
||||
|
||||
|
@ -51,11 +50,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
|||
t.join();
|
||||
}
|
||||
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
||||
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
|
||||
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
|
||||
std::vector<uint8_t> dataq(ggml_type_size(tensor->type)*size/ggml_blck_size(tensor->type));
|
||||
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
|
||||
int64_t hist[16];
|
||||
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
|
||||
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
||||
|
@ -71,23 +70,29 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|||
std::vector<uint8_t> buf(ggml_nbytes(t));
|
||||
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
|
||||
|
||||
ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
|
||||
size_t bs = ggml_blck_size(t->type);
|
||||
std::vector<float> vq(ggml_blck_size(t->type));
|
||||
bool quantized = ggml_is_quantized(t->type);
|
||||
|
||||
// access elements by index to avoid gaps in views
|
||||
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
|
||||
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
|
||||
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
|
||||
for (int64_t i0 = 0; i0 < t->ne[0]; i0++) {
|
||||
size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
|
||||
float v;
|
||||
for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
|
||||
size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
|
||||
if (t->type == GGML_TYPE_F16) {
|
||||
v = (float) ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]);
|
||||
tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
|
||||
} else if (t->type == GGML_TYPE_F32) {
|
||||
v = *(float *) &buf[i];
|
||||
tv.push_back(*(float *) &buf[i]);
|
||||
} else if (t->type == GGML_TYPE_I32) {
|
||||
v = *(int32_t *) &buf[i];
|
||||
tv.push_back((float)*(int32_t *) &buf[i]);
|
||||
} else if (quantized) {
|
||||
tt.to_float(&buf[i], vq.data(), bs);
|
||||
tv.insert(tv.end(), vq.begin(), vq.end());
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
tv.push_back(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -230,9 +235,18 @@ static bool ggml_is_view_op(enum ggml_op op) {
|
|||
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
|
||||
}
|
||||
|
||||
enum test_mode {
|
||||
MODE_TEST,
|
||||
MODE_PERF,
|
||||
};
|
||||
|
||||
struct test_case {
|
||||
virtual ~test_case() {}
|
||||
|
||||
virtual std::string op_desc(ggml_tensor * t) {
|
||||
return ggml_op_desc(t);
|
||||
}
|
||||
|
||||
virtual std::string vars() {
|
||||
return "";
|
||||
}
|
||||
|
@ -240,7 +254,7 @@ struct test_case {
|
|||
virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
|
||||
|
||||
virtual double max_nmse_err() {
|
||||
return 1e-6;
|
||||
return 1e-7;
|
||||
}
|
||||
|
||||
virtual void initialize_tensors(ggml_context * ctx) {
|
||||
|
@ -260,7 +274,58 @@ struct test_case {
|
|||
return size;
|
||||
}
|
||||
|
||||
ggml_cgraph * gf = nullptr;
|
||||
|
||||
static const int sentinel_size = 1024;
|
||||
|
||||
test_mode mode;
|
||||
|
||||
std::vector<ggml_tensor *> sentinels;
|
||||
|
||||
void add_sentinel(ggml_context * ctx) {
|
||||
if (mode == MODE_PERF) {
|
||||
return;
|
||||
}
|
||||
ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
|
||||
ggml_format_name(sentinel, "sent_%zu", sentinels.size());
|
||||
sentinels.push_back(sentinel);
|
||||
}
|
||||
|
||||
// hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
|
||||
|
||||
ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
|
||||
ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
|
||||
add_sentinel(ctx);
|
||||
return t;
|
||||
}
|
||||
|
||||
ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
|
||||
ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
|
||||
add_sentinel(ctx);
|
||||
return t;
|
||||
}
|
||||
|
||||
ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
|
||||
ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
|
||||
add_sentinel(ctx);
|
||||
return t;
|
||||
}
|
||||
|
||||
ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||
ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
|
||||
add_sentinel(ctx);
|
||||
return t;
|
||||
}
|
||||
|
||||
ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
|
||||
add_sentinel(ctx);
|
||||
return t;
|
||||
}
|
||||
|
||||
bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
|
||||
mode = MODE_TEST;
|
||||
|
||||
ggml_init_params params = {
|
||||
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
||||
/* .mem_base = */ NULL,
|
||||
|
@ -268,15 +333,20 @@ struct test_case {
|
|||
};
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
|
||||
gf = ggml_new_graph(ctx);
|
||||
|
||||
// pre-graph sentinel
|
||||
add_sentinel(ctx);
|
||||
|
||||
ggml_tensor * out = build_graph(ctx);
|
||||
|
||||
if (op_name != nullptr && strcmp(ggml_op_desc(out), op_name) != 0) {
|
||||
//printf(" %s: skipping\n", ggml_op_desc(out));
|
||||
if (op_name != nullptr && op_desc(out) != op_name) {
|
||||
//printf(" %s: skipping\n", op_desc(out).c_str());
|
||||
ggml_free(ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
printf(" %s(%s): ", ggml_op_desc(out), vars().c_str());
|
||||
printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// check if backends support op
|
||||
|
@ -288,13 +358,20 @@ struct test_case {
|
|||
}
|
||||
}
|
||||
|
||||
// post-graph sentinel
|
||||
add_sentinel(ctx);
|
||||
|
||||
// allocate
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
|
||||
|
||||
// build graph
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx);
|
||||
ggml_build_forward_expand(gf, out);
|
||||
|
||||
// add sentinels as graph nodes so that they are checked in the callback
|
||||
for (ggml_tensor * sentinel : sentinels) {
|
||||
gf->nodes[gf->n_nodes++] = sentinel;
|
||||
}
|
||||
|
||||
// randomize tensors
|
||||
initialize_tensors(ctx);
|
||||
|
||||
|
@ -310,14 +387,29 @@ struct test_case {
|
|||
};
|
||||
|
||||
auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
|
||||
callback_userdata * ud = (callback_userdata *) user_data;
|
||||
|
||||
if (t1->op == GGML_OP_NONE) {
|
||||
// sentinels must be unchanged
|
||||
std::vector<uint8_t> t1_data(ggml_nbytes(t1));
|
||||
std::vector<uint8_t> t2_data(ggml_nbytes(t2));
|
||||
ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
|
||||
ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
|
||||
|
||||
if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
|
||||
printf("sentinel mismatch: %s ", t1->name);
|
||||
ud->ok = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<float> f1 = tensor_to_float(t1);
|
||||
std::vector<float> f2 = tensor_to_float(t2);
|
||||
callback_userdata * ud = (callback_userdata *) user_data;
|
||||
|
||||
for (size_t i = 0; i < f1.size(); i++) {
|
||||
// check for nans
|
||||
if (std::isnan(f1[i]) || std::isnan(f2[i])) {
|
||||
printf("NaN at index %zu ", i);
|
||||
printf("[%s] NaN at index %zu (%f %f) ", ggml_op_desc(t1), i, f1[i], f2[i]);
|
||||
ud->ok = false;
|
||||
return true;
|
||||
}
|
||||
|
@ -325,12 +417,12 @@ struct test_case {
|
|||
if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
|
||||
if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
|
||||
if (std::signbit(f1[i]) != std::signbit(f2[i])) {
|
||||
printf("inf sign mismatch: %f %f ", f1[i], f2[i]);
|
||||
printf("[%s] inf sign mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]);
|
||||
ud->ok = false;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
printf("inf mismatch: %f %f ", f1[i], f2[i]);
|
||||
printf("[%s] inf mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]);
|
||||
ud->ok = false;
|
||||
return true;
|
||||
}
|
||||
|
@ -339,10 +431,17 @@ struct test_case {
|
|||
|
||||
double err = nmse(f1.data(), f2.data(), f1.size());
|
||||
if (err > ud->max_err) {
|
||||
printf("NMSE = %f ", err);
|
||||
printf("[%s] NMSE = %f ", ggml_op_desc(t1), err);
|
||||
//for (int i = 0; i < f1.size(); i++) {
|
||||
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
|
||||
//}
|
||||
//printf("\n");
|
||||
//exit(1);
|
||||
ud->ok = false;
|
||||
}
|
||||
return true;
|
||||
|
||||
GGML_UNUSED(index);
|
||||
};
|
||||
|
||||
ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
|
||||
|
@ -361,6 +460,8 @@ struct test_case {
|
|||
}
|
||||
|
||||
bool eval_perf(ggml_backend_t backend, const char * op_name) {
|
||||
mode = MODE_PERF;
|
||||
|
||||
static const size_t graph_nodes = 8192;
|
||||
|
||||
ggml_init_params params = {
|
||||
|
@ -372,13 +473,13 @@ struct test_case {
|
|||
|
||||
ggml_tensor * out = build_graph(ctx);
|
||||
|
||||
if (op_name != nullptr && strcmp(ggml_op_desc(out), op_name) != 0) {
|
||||
//printf(" %s: skipping\n", ggml_op_desc(out));
|
||||
if (op_name != nullptr && op_desc(out) != op_name) {
|
||||
//printf(" %s: skipping\n", op_desc(out).c_str());
|
||||
ggml_free(ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
int len = printf(" %s(%s): ", ggml_op_desc(out), vars().c_str());
|
||||
int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// check if backends support op
|
||||
|
@ -430,8 +531,9 @@ struct test_case {
|
|||
return size;
|
||||
};
|
||||
for (int i = 0; i < gf->n_nodes; i++) {
|
||||
if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out)
|
||||
if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
|
||||
continue;
|
||||
}
|
||||
mem += tensor_op_size(gf->nodes[i]);
|
||||
}
|
||||
|
||||
|
@ -486,17 +588,22 @@ struct test_get_rows : public test_case {
|
|||
const int n; // cols
|
||||
const int m; // rows
|
||||
const int r; // rows to get
|
||||
const int b; // batch size
|
||||
const bool v; // view (non-contiguous src1)
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR4(type, n, m, r);
|
||||
return VARS_TO_STR6(type, n, m, r, b, v);
|
||||
}
|
||||
|
||||
test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3)
|
||||
: type(type), n(n), m(m), r(r) {}
|
||||
test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
|
||||
: type(type), n(n), m(m), r(r), b(b), v(v) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * in = ggml_new_tensor_2d(ctx, type, n, m);
|
||||
ggml_tensor * rows = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, r);
|
||||
ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
|
||||
ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
|
||||
if (v) {
|
||||
rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
|
||||
}
|
||||
ggml_tensor * out = ggml_get_rows(ctx, in, rows);
|
||||
return out;
|
||||
}
|
||||
|
@ -504,12 +611,13 @@ struct test_get_rows : public test_case {
|
|||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (t->type == GGML_TYPE_I32) {
|
||||
if (ggml_is_view_op(t->op)) { continue; }
|
||||
// rows
|
||||
std::vector<int> data(r);
|
||||
for (int i = 0; i < r; i++) {
|
||||
std::vector<int> data(r*b);
|
||||
for (int i = 0; i < r*b; i++) {
|
||||
data[i] = rand() % m;
|
||||
}
|
||||
ggml_backend_tensor_set(t, data.data(), 0, r * sizeof(int));
|
||||
ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
|
||||
} else {
|
||||
init_tensor_uniform(t);
|
||||
}
|
||||
|
@ -770,11 +878,10 @@ struct test_mul_mat_id : public test_case {
|
|||
const int64_t m;
|
||||
const int64_t n;
|
||||
const int64_t k;
|
||||
const std::array<int64_t, 2> bs; // dims 3 and 4
|
||||
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
|
||||
const bool v; // view (non-contiguous ids)
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR9(type_a, type_b, n_mats, id, m, n, k, bs, nr);
|
||||
return VARS_TO_STR8(type_a, type_b, n_mats, id, m, n, k, v);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
|
@ -782,7 +889,7 @@ struct test_mul_mat_id : public test_case {
|
|||
}
|
||||
|
||||
size_t op_size(ggml_tensor * t) override {
|
||||
size_t a = ggml_nbytes(t->src[2]) * n * nr[0] * nr[1];
|
||||
size_t a = ggml_nbytes(t->src[2]) * n;
|
||||
size_t b = ggml_nbytes(t->src[1]) * m;
|
||||
size_t c = ggml_nbytes(t);
|
||||
return a + b + c;
|
||||
|
@ -792,35 +899,41 @@ struct test_mul_mat_id : public test_case {
|
|||
|
||||
test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
||||
int n_mats = 2, int id = 0,
|
||||
int64_t m = 32, int64_t n = 32, int64_t k = 32,
|
||||
std::array<int64_t, 2> bs = {10, 10},
|
||||
std::array<int64_t, 2> nr = {2, 2})
|
||||
int64_t m = 32, int64_t n = 32, int64_t k = 32, bool v = false)
|
||||
: type_a(type_a), type_b(type_b), n_mats(n_mats), id(id),
|
||||
m(m), n(n), k(k), bs(bs), nr(nr) {}
|
||||
m(m), n(n), k(k), v(v) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
|
||||
std::vector<ggml_tensor *> mats;
|
||||
for (int i = 0; i < n_mats; i++) {
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
|
||||
ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
|
||||
mats.push_back(a);
|
||||
}
|
||||
ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_mats);
|
||||
ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
|
||||
ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), ids, id, b);
|
||||
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
|
||||
if (v) {
|
||||
ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
|
||||
}
|
||||
ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
|
||||
ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b);
|
||||
return out;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
std::random_device rd;
|
||||
std::default_random_engine rng(rd());
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (t->type == GGML_TYPE_I32) {
|
||||
if (ggml_is_view_op(t->op)) { continue; }
|
||||
// ids
|
||||
std::vector<int> data(n_mats);
|
||||
for (int i = 0; i < n_mats; i++) {
|
||||
data[i] = i;
|
||||
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
||||
std::vector<int32_t> data(t->ne[0]);
|
||||
for (int i = 0; i < t->ne[0]; i++) {
|
||||
data[i] = i % n_mats;
|
||||
}
|
||||
std::shuffle(data.begin(), data.end(), rng);
|
||||
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
|
||||
}
|
||||
std::shuffle(data.begin(), data.end(), std::default_random_engine(std::random_device()()));
|
||||
ggml_backend_tensor_set(t, data.data(), 0, n_mats * sizeof(int));
|
||||
} else {
|
||||
init_tensor_uniform(t);
|
||||
}
|
||||
|
@ -1109,22 +1222,227 @@ struct test_sum_rows : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
enum test_mode {
|
||||
MODE_TEST,
|
||||
MODE_PERF,
|
||||
// GGML_OP_UPSCALE
|
||||
struct test_upscale : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const int32_t scale_factor;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne, scale_factor);
|
||||
}
|
||||
|
||||
test_upscale(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {512, 512, 3, 1},
|
||||
int32_t scale_factor = 2)
|
||||
: type(type), ne(ne), scale_factor(scale_factor) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_GROUP_NORM
|
||||
struct test_group_norm : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const int32_t num_groups;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne, num_groups);
|
||||
}
|
||||
|
||||
test_group_norm(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {64, 64, 320, 1},
|
||||
int32_t num_groups = 32)
|
||||
: type(type), ne(ne), num_groups(num_groups) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_ACC
|
||||
struct test_acc : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne_a;
|
||||
const std::array<int64_t, 4> ne_b;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne_a, ne_b);
|
||||
}
|
||||
|
||||
test_acc(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
|
||||
std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
|
||||
: type(type), ne_a(ne_a), ne_b(ne_b) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
||||
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
||||
ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_PAD
|
||||
struct test_pad : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne_a;
|
||||
const int pad_0;
|
||||
const int pad_1;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
|
||||
}
|
||||
|
||||
test_pad(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
|
||||
int pad_0 = 1, int pad_1 = 1)
|
||||
: type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
||||
ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_LEAKY_RELU
|
||||
struct test_leaky_relu : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne_a;
|
||||
const float negative_slope;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne_a, negative_slope);
|
||||
}
|
||||
|
||||
test_leaky_relu(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
|
||||
float negative_slope = 0.1f)
|
||||
: type(type), ne_a(ne_a), negative_slope(negative_slope) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
||||
ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// Mixtral MOE
|
||||
struct test_moe : public test_case {
|
||||
const int n_experts;
|
||||
const int n_experts_per_tok;
|
||||
const int n_tokens;
|
||||
const int n_embd;
|
||||
const int n_ff;
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
return "MOE";
|
||||
|
||||
GGML_UNUSED(t);
|
||||
}
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
|
||||
}
|
||||
|
||||
test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
|
||||
: n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
|
||||
}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);
|
||||
|
||||
std::vector<ggml_tensor *> ffn_up_exp(n_experts);
|
||||
std::vector<ggml_tensor *> ffn_gate_exp(n_experts);
|
||||
std::vector<ggml_tensor *> ffn_down_exp(n_experts);
|
||||
|
||||
for (int i = 0; i < n_experts; ++i) {
|
||||
ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||
ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||
ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
|
||||
}
|
||||
|
||||
ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
|
||||
|
||||
ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
|
||||
ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd));
|
||||
|
||||
// select experts
|
||||
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
|
||||
|
||||
ggml_tensor * weights = ggml_get_rows(ctx,
|
||||
ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);
|
||||
|
||||
weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
|
||||
|
||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights);
|
||||
|
||||
weights = ggml_div(ctx, weights, weights_sum);
|
||||
|
||||
// compute expert outputs
|
||||
ggml_tensor * moe_out = nullptr;
|
||||
|
||||
for (int i = 0; i < n_experts_per_tok; ++i) {
|
||||
ggml_tensor * cur_expert;
|
||||
|
||||
ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur);
|
||||
|
||||
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur);
|
||||
|
||||
cur_gate = ggml_silu(ctx, cur_gate);
|
||||
|
||||
cur_expert = ggml_mul(ctx, cur_up, cur_gate);
|
||||
|
||||
cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert);
|
||||
|
||||
cur_expert = ggml_mul(ctx, cur_expert,
|
||||
ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
||||
|
||||
if (i == 0) {
|
||||
moe_out = cur_expert;
|
||||
} else {
|
||||
moe_out = ggml_add(ctx, moe_out, cur_expert);
|
||||
}
|
||||
}
|
||||
|
||||
cur = moe_out;
|
||||
|
||||
return cur;
|
||||
}
|
||||
};
|
||||
|
||||
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
|
||||
std::vector<std::unique_ptr<test_case>> test_cases;
|
||||
|
||||
const ggml_type all_types[] = {
|
||||
GGML_TYPE_F32, GGML_TYPE_F16,
|
||||
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
||||
GGML_TYPE_Q8_0,
|
||||
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
||||
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
||||
GGML_TYPE_Q6_K
|
||||
};
|
||||
|
||||
// unary ops
|
||||
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
|
||||
test_cases.emplace_back(new test_unary((ggml_unary_op) op));
|
||||
}
|
||||
|
||||
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_get_rows(type, 10, 5, 3));
|
||||
test_cases.emplace_back(new test_get_rows(type, 16, 5, 3));
|
||||
test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
|
||||
for (ggml_type type : all_types) {
|
||||
for (int b : {1, 7}) {
|
||||
for (bool v : {false, true}) {
|
||||
test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
|
||||
|
@ -1134,7 +1452,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
|
||||
|
||||
test_cases.emplace_back(new test_dup());
|
||||
test_cases.emplace_back(new test_cpy());
|
||||
|
||||
for (ggml_type type : all_types) {
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_cont());
|
||||
|
||||
auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
|
||||
|
@ -1144,6 +1466,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
};
|
||||
|
||||
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
|
||||
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
|
||||
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
|
||||
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
|
||||
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
|
||||
|
@ -1170,8 +1493,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
|
||||
add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
|
||||
add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
|
||||
add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
|
||||
add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
|
||||
//add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
|
||||
//add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
|
||||
|
||||
test_cases.emplace_back(new test_scale());
|
||||
|
||||
|
@ -1180,16 +1503,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
|
||||
}
|
||||
|
||||
const ggml_type all_types[] = {
|
||||
GGML_TYPE_F32, GGML_TYPE_F16,
|
||||
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
||||
GGML_TYPE_Q8_0,
|
||||
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
||||
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
||||
GGML_TYPE_Q6_K
|
||||
};
|
||||
|
||||
for (ggml_type type_a : all_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
||||
// FIXME: CPU crashes on f16xf16
|
||||
|
@ -1213,9 +1526,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
|
||||
for (ggml_type type_a : all_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
||||
for (int n_mats : {1, 2, 4}) {
|
||||
for (int n_mats : {2, 4, 8}) {
|
||||
for (int id = 0; id < n_mats; id++) {
|
||||
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, {1, 1}, {1, 1}));
|
||||
for (bool v : {false, true}) {
|
||||
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, v));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1247,10 +1562,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_concat());
|
||||
|
||||
for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_sum_rows());
|
||||
test_cases.emplace_back(new test_upscale());
|
||||
test_cases.emplace_back(new test_group_norm());
|
||||
test_cases.emplace_back(new test_acc());
|
||||
test_cases.emplace_back(new test_pad());
|
||||
test_cases.emplace_back(new test_leaky_relu());
|
||||
|
||||
#if !defined(__SANITIZE_THREAD__)
|
||||
// FIXME: these tests use too much memory with thread sanitizer
|
||||
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
|
||||
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
|
||||
#endif
|
||||
|
||||
// run tests
|
||||
if (mode == MODE_TEST) {
|
||||
|
@ -1267,14 +1594,17 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
ggml_backend_free(backend_cpu);
|
||||
|
||||
return n_ok == test_cases.size();
|
||||
} else if (mode == MODE_PERF) {
|
||||
}
|
||||
|
||||
if (mode == MODE_PERF) {
|
||||
for (auto & test : test_cases) {
|
||||
test->eval_perf(backend, op_name);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
GGML_ASSERT(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
static void usage(char ** argv) {
|
||||
|
@ -1347,11 +1677,12 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
|
||||
|
||||
if (n_ok != ggml_backend_reg_get_count()) {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
return 1;
|
||||
} else {
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
|
||||
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cmath>
|
||||
|
|
|
@ -117,7 +117,7 @@ static void usage(char * argv[]) {
|
|||
printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
|
||||
printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
|
||||
printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
|
||||
printf(" --op OP set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
|
||||
printf(" --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
|
||||
printf(" quantize_row_q_dot, vec_dot_q (all)\n");
|
||||
printf(" --type TYPE set test type as");
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
|
@ -202,7 +202,7 @@ int main(int argc, char * argv[]) {
|
|||
}
|
||||
int alignment = std::stoi(argv[i]);
|
||||
if (alignment < 0 || alignment > MAX_ALIGNMENT) {
|
||||
fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT);
|
||||
fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
|
@ -286,7 +286,7 @@ int main(int argc, char * argv[]) {
|
|||
qfns.from_float_reference(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
benchmark_function(size, quantized_size, iterations, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
|
@ -300,7 +300,7 @@ int main(int argc, char * argv[]) {
|
|||
qfns.from_float(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
benchmark_function(size, quantized_size, iterations, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
|
@ -315,7 +315,7 @@ int main(int argc, char * argv[]) {
|
|||
qfns.to_float(test_q1, test_out, size);
|
||||
return test_out[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
benchmark_function(size, quantized_size, iterations, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
|
@ -330,7 +330,7 @@ int main(int argc, char * argv[]) {
|
|||
vdot.from_float(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
benchmark_function(size, quantized_size, iterations, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
|
@ -347,7 +347,7 @@ int main(int argc, char * argv[]) {
|
|||
qfns.vec_dot(size, &result, test_q1, test_q2);
|
||||
return result;
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
benchmark_function(size, quantized_size, iterations, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue