Merge branch 'master' into finetune-lora

# Conflicts:
#	ggml-alloc.c
xaedes 2023-09-04 02:40:44 +02:00
commit 9ea2f7ff58
42 changed files with 1542 additions and 1033 deletions


@@ -18,7 +18,6 @@ on:
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   GGML_NLOOP: 3
-  GGML_NITER: 1
   GGML_N_THREADS: 1

 jobs:

.github/workflows/code-coverage.yml (new file)

@@ -0,0 +1,36 @@
+name: Code Coverage
+on: [push, pull_request]
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+
+jobs:
+  run:
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential gcc-8 lcov
+
+      - name: Build
+        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
+
+      - name: Run tests
+        run: CC=gcc-8 make test
+
+      - name: Generate coverage report
+        run: |
+          make coverage
+          make lcov-report
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+        with:
+          files: lcov-report/coverage.info

.gitignore

@@ -6,6 +6,10 @@
 *.exe
 *.dll
 *.log
+*.gcov
+*.gcno
+*.gcda
+*.dot
 .DS_Store
 .build/
 .cache/
@@ -17,6 +21,9 @@
 .vs/
 .vscode/

+lcov-report/
+gcovr-report/
+
 build*/
 out/
 tmp/


@@ -403,6 +403,7 @@ if (LLAMA_ALL_WARNINGS)
         -Wpointer-arith
         -Wmissing-prototypes
         -Werror=implicit-int
+        -Wno-unused-function
     )
     set(cxx_flags
         -Wall
@@ -412,6 +413,10 @@ if (LLAMA_ALL_WARNINGS)
         -Wno-unused-function
         -Wno-multichar
     )
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        # g++ only
+        set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+    endif()
 else()
     # todo : msvc
 endif()

Makefile

@@ -4,6 +4,9 @@ BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-tex
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1

+# Code coverage output files
+COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
+
 default: $(BUILD_TARGETS)

 test:
@@ -23,6 +26,18 @@ test:

 all: $(BUILD_TARGETS) $(TEST_TARGETS)

+coverage: ## Run code coverage
+	gcov -pb tests/*.cpp
+
+lcov-report: coverage ## Generate lcov report
+	mkdir -p lcov-report
+	lcov --capture --directory . --output-file lcov-report/coverage.info
+	genhtml lcov-report/coverage.info --output-directory lcov-report
+
+gcovr-report: coverage ## Generate gcovr report
+	mkdir -p gcovr-report
+	gcovr --root . --html --html-details --output gcovr-report/coverage.html
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -35,6 +50,11 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

+ifdef RISCV_CROSS_COMPILE
+CC := riscv64-unknown-linux-gnu-gcc
+CXX := riscv64-unknown-linux-gnu-g++
+endif
+
 CCV := $(shell $(CC) --version | head -n 1)
 CXXV := $(shell $(CXX) --version | head -n 1)
@@ -62,53 +82,48 @@ OPT = -Ofast
 else
 OPT = -O3
 endif
-CFLAGS = -I. $(OPT) -std=c11 -fPIC
-CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
-LDFLAGS =
+MK_CPPFLAGS = -I. -Icommon
+MK_CFLAGS = $(CPPFLAGS) $(OPT) -std=c11 -fPIC
+MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
+MK_LDFLAGS =

 ifdef LLAMA_DEBUG
-CFLAGS += -O0 -g
-CXXFLAGS += -O0 -g
-LDFLAGS += -g
+MK_CFLAGS += -O0 -g
+MK_CXXFLAGS += -O0 -g
+MK_LDFLAGS += -g
 else
-CFLAGS += -DNDEBUG
-CXXFLAGS += -DNDEBUG
+MK_CPPFLAGS += -DNDEBUG
 endif

 ifdef LLAMA_SERVER_VERBOSE
-CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
+MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif

+ifdef LLAMA_CODE_COVERAGE
+CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
+endif
+
+ifdef LLAMA_DISABLE_LOGS
+CFLAGS += -DLOG_DISABLE_LOGS
+CXXFLAGS += -DLOG_DISABLE_LOGS
+endif # LLAMA_DISABLE_LOGS
+
 # warnings
-CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-	-Wmissing-prototypes -Werror=implicit-int
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
+MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
+	-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
+MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
+
+ifeq '' '$(findstring clang++,$(CXX))'
+# g++ only
+CXXFLAGS += -Wno-format-truncation
+endif

 # OS specific
 # TODO: support Windows
-ifeq ($(UNAME_S),Linux)
-CFLAGS += -pthread
-CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),Darwin)
-CFLAGS += -pthread
-CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),FreeBSD)
-CFLAGS += -pthread
-CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),NetBSD)
-CFLAGS += -pthread
-CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),OpenBSD)
-CFLAGS += -pthread
-CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),Haiku)
-CFLAGS += -pthread
-CXXFLAGS += -pthread
+ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
+MK_CFLAGS += -pthread
+MK_CXXFLAGS += -pthread
 endif

 # detect Windows
@@ -134,72 +149,84 @@ ifeq ($(_WIN32),1)
 endif

 ifdef LLAMA_GPROF
-CFLAGS += -pg
-CXXFLAGS += -pg
+MK_CFLAGS += -pg
+MK_CXXFLAGS += -pg
 endif
 ifdef LLAMA_PERF
-CFLAGS += -DGGML_PERF
-CXXFLAGS += -DGGML_PERF
+MK_CPPFLAGS += -DGGML_PERF
 endif

 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
+ifndef RISCV
+
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 # Use all CPU extensions that are available:
-CFLAGS += -march=native -mtune=native
-CXXFLAGS += -march=native -mtune=native
+MK_CFLAGS += -march=native -mtune=native
+MK_CXXFLAGS += -march=native -mtune=native
 # Usage AVX-only
-#CFLAGS += -mfma -mf16c -mavx
-#CXXFLAGS += -mfma -mf16c -mavx
+#MK_CFLAGS += -mfma -mf16c -mavx
+#MK_CXXFLAGS += -mfma -mf16c -mavx
 # Usage SSSE3-only (Not is SSE3!)
-#CFLAGS += -mssse3
-#CXXFLAGS += -mssse3
+#MK_CFLAGS += -mssse3
+#MK_CXXFLAGS += -mssse3
+endif
+
+# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
+# https://github.com/ggerganov/llama.cpp/issues/2922
+ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
+CFLAGS += -Xassembler -muse-unaligned-vector-move
+CXXFLAGS += -Xassembler -muse-unaligned-vector-move
 endif

 ifneq ($(filter aarch64%,$(UNAME_M)),)
 # Apple M1, M2, etc.
 # Raspberry Pi 3, 4, Zero 2 (64-bit)
-CFLAGS += -mcpu=native
-CXXFLAGS += -mcpu=native
+MK_CFLAGS += -mcpu=native
+MK_CXXFLAGS += -mcpu=native
 endif

 ifneq ($(filter armv6%,$(UNAME_M)),)
 # Raspberry Pi 1, Zero
-CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif

 ifneq ($(filter armv7%,$(UNAME_M)),)
 # Raspberry Pi 2
-CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif

 ifneq ($(filter armv8%,$(UNAME_M)),)
 # Raspberry Pi 3, 4, Zero 2 (32-bit)
-CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+MK_CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

 ifneq ($(filter ppc64%,$(UNAME_M)),)
 POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 ifneq (,$(findstring POWER9,$(POWER9_M)))
-CFLAGS += -mcpu=power9
-CXXFLAGS += -mcpu=power9
+MK_CFLAGS += -mcpu=power9
+MK_CXXFLAGS += -mcpu=power9
 endif
-# Require c++23's std::byteswap for big-endian support.
-ifeq ($(UNAME_M),ppc64)
-CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
-endif
 endif

+else
+CFLAGS += -march=rv64gcv -mabi=lp64d
+CXXFLAGS += -march=rv64gcv -mabi=lp64d
+endif
+
 ifndef LLAMA_NO_K_QUANTS
-CFLAGS += -DGGML_USE_K_QUANTS
-CXXFLAGS += -DGGML_USE_K_QUANTS
+MK_CPPFLAGS += -DGGML_USE_K_QUANTS
 OBJS += k_quants.o
 ifdef LLAMA_QKK_64
-CFLAGS += -DGGML_QKK_64
-CXXFLAGS += -DGGML_QKK_64
+MK_CPPFLAGS += -DGGML_QKK_64
 endif
 endif
@@ -207,31 +234,32 @@ ifndef LLAMA_NO_ACCELERATE
 # Mac M1 - include Accelerate framework.
 # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
 ifeq ($(UNAME_S),Darwin)
-CFLAGS += -DGGML_USE_ACCELERATE
-LDFLAGS += -framework Accelerate
+MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+MK_LDFLAGS += -framework Accelerate
 endif
 endif # LLAMA_NO_ACCELERATE

 ifdef LLAMA_MPI
-CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
-CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+MK_CPPFLAGS += -DGGML_USE_MPI
+MK_CFLAGS += -Wno-cast-qual
+MK_CXXFLAGS += -Wno-cast-qual
 OBJS += ggml-mpi.o
 endif # LLAMA_MPI

 ifdef LLAMA_OPENBLAS
-CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
-LDFLAGS += $(shell pkg-config --libs openblas)
+MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
+MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
+MK_LDFLAGS += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS

 ifdef LLAMA_BLIS
-CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
-LDFLAGS += -lblis -L/usr/local/lib
+MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+MK_LDFLAGS += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 OBJS += ggml-cuda.o
 NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
@@ -282,14 +310,15 @@ endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
-CFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
-CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
+MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
+MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)

 # Mac provides OpenCL as a framework
 ifeq ($(UNAME_S),Darwin)
-LDFLAGS += -lclblast -framework OpenCL
+MK_LDFLAGS += -lclblast -framework OpenCL
 else
-LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
+MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
 endif
 OBJS += ggml-opencl.o
@@ -304,10 +333,9 @@ ifdef LLAMA_HIPBLAS
 LLAMA_CUDA_DMMV_X ?= 32
 LLAMA_CUDA_MMV_Y ?= 1
 LLAMA_CUDA_KQUANTS_ITER ?= 2
-CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-LDFLAGS += -lhipblas -lamdhip64 -lrocblas
+MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
 HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
 HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
@@ -322,10 +350,9 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif # LLAMA_HIPBLAS

 ifdef LLAMA_METAL
-CFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
-CXXFLAGS += -DGGML_USE_METAL
-LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
-OBJS += ggml-metal.o
+MK_CPPFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
+MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
+OBJS += ggml-metal.o
 endif # LLAMA_METAL

 ifdef LLAMA_METAL
@@ -338,15 +365,16 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

-ifdef LLAMA_NO_K_QUANTS
+ifndef LLAMA_NO_K_QUANTS
 k_quants.o: k_quants.c k_quants.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_NO_K_QUANTS

-ifdef LLAMA_DISABLE_LOGS
-CFLAGS += -DLOG_DISABLE_LOGS
-CXXFLAGS += -DLOG_DISABLE_LOGS
-endif # LLAMA_DISABLE_LOGS
+# combine build flags with cmdline overrides
+override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS := $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

 #
 # Print build information
@@ -391,7 +419,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)

 #
 # Examples


@@ -12,9 +12,18 @@ let package = Package(
         name: "llama",
         path: ".",
         exclude: ["ggml-metal.metal"],
-        sources: ["ggml.c", "llama.cpp"],
+        sources: [
+            "ggml.c",
+            "llama.cpp",
+            "ggml-alloc.c",
+            "k_quants.c"
+        ],
         publicHeadersPath: "spm-headers",
-        cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
+        cSettings: [
+            .unsafeFlags(["-Wno-shorten-64-to-32"]),
+            .define("GGML_USE_K_QUANTS"),
+            .define("GGML_USE_ACCELERATE")
+        ],
         linkerSettings: [
             .linkedFramework("Accelerate")
         ]


@@ -114,11 +114,13 @@ as the main playground for developing new features for the [ggml](https://github
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
+- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)

 **UI:**

 - [nat/openplayground](https://github.com/nat/openplayground)
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
+- [withcatai/catai](https://github.com/withcatai/catai)

 ---
@@ -463,6 +465,8 @@ Building the program with BLAS support may lead to some performance improvements
 You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
 - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.

 - <details>
   <summary>Installing the OpenCL SDK from source</summary>
@@ -480,10 +484,27 @@ Building the program with BLAS support may lead to some performance improvements
   ```
   </details>

-Installing CLBlast: it may be found in your operating system's packages.
+##### Installing CLBlast
+
+Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
+
+Alternatively, they may be built from source.

 - <details>
-  <summary>If not, then installing from source:</summary>
+  <summary>Windows:</summary>
+
+  ```cmd
+  set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
+  git clone https://github.com/CNugteren/CLBlast.git
+  mkdir CLBlast\build
+  cd CLBlast\build
+  cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
+  cmake --build . --config Release
+  cmake --install . --prefix C:/CLBlast
+  ```
+
+- <details>
+  <summary>Unix:</summary>

   ```sh
   git clone https://github.com/CNugteren/CLBlast.git
@@ -497,21 +518,32 @@ Building the program with BLAS support may lead to some performance improvements
   Where `/some/path` is where the built library will be installed (default is `/usr/local`).
   </details>

-Building:
+##### Building Llama with CLBlast

 - Build with make:
   ```sh
   make LLAMA_CLBLAST=1
   ```
-- CMake:
+- CMake (Unix):
   ```sh
   mkdir build
   cd build
   cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
   cmake --build . --config Release
   ```
+- CMake (Windows):
+  ```cmd
+  set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
+  git clone https://github.com/ggerganov/llama.cpp
+  cd llama.cpp
+  mkdir build
+  cd build
+  cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
+  cmake --build . --config Release
+  cmake --install . --prefix C:/LlamaCPP
+  ```

-Running:
+##### Running Llama with CLBlast

 The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.

codecov.yml (new file)

@@ -0,0 +1,14 @@
+comment: off
+
+coverage:
+  status:
+    project:
+      default:
+        target: auto
+        threshold: 0
+        base: auto
+    patch:
+      default:
+        target: auto
+        threshold: 0
+        base: auto


@@ -24,7 +24,9 @@
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
 #include <codecvt>
 #include <locale>
 #include <windows.h>
@@ -303,6 +305,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_keep = std::stoi(argv[i]);
+        } else if (arg == "--draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_draft = std::stoi(argv[i]);
         } else if (arg == "--chunks") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -315,6 +323,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "-md" || arg == "--model-draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_draft = argv[i];
         } else if (arg == "-a" || arg == "--alias") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -648,6 +662,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
     fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
     fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stdout, " --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
         fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
@@ -680,6 +695,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stdout, " -m FNAME, --model FNAME\n");
     fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, " -md FNAME, --model-draft FNAME\n");
+    fprintf(stdout, " draft model for speculative decoding (default: %s)\n", params.model.c_str());
     fprintf(stdout, " -ld LOGDIR, --logdir LOGDIR\n");
     fprintf(stdout, " path under which to save YAML logs (no logging if unset)\n");
     fprintf(stdout, "\n");
@@ -768,6 +785,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
     }

+    {
+        LOG("warming up the model with an empty run\n");
+
+        const std::vector<llama_token> tmp = { llama_token_bos(lctx), };
+        llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_reset_timings(lctx);
+    }
+
     return std::make_tuple(model, lctx);
 }
@@ -840,6 +865,130 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
     return result;
 }

+//
+// Sampling utils
+//
+
+llama_token llama_sample_token(
+        struct llama_context * ctx,
+        struct llama_context * ctx_guidance,
+        struct llama_grammar * grammar,
+        const struct gpt_params & params,
+        const std::vector<llama_token> & last_tokens,
+        std::vector<llama_token_data> & candidates,
+        int idx) {
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_vocab = llama_n_vocab(ctx);
+
+    const float   temp            = params.temp;
+    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
+    const float   top_p           = params.top_p;
+    const float   tfs_z           = params.tfs_z;
+    const float   typical_p       = params.typical_p;
+    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+    const float   repeat_penalty  = params.repeat_penalty;
+    const float   alpha_presence  = params.presence_penalty;
+    const float   alpha_frequency = params.frequency_penalty;
+    const int     mirostat        = params.mirostat;
+    const float   mirostat_tau    = params.mirostat_tau;
+    const float   mirostat_eta    = params.mirostat_eta;
+    const bool    penalize_nl     = params.penalize_nl;
+
+    llama_token id = 0;
+
+    float * logits = llama_get_logits(ctx) + idx * n_vocab;
+
+    // Apply params.logit_bias map
+    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        logits[it->first] += it->second;
+    }
+
+    candidates.clear();
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+
+    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+
+    if (ctx_guidance) {
+        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
+    }
+
+    // apply penalties
+    if (!last_tokens.empty()) {
+        const float nl_logit = logits[llama_token_nl(ctx)];
+        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
+
+        llama_sample_repetition_penalty(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, repeat_penalty);
+        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, alpha_frequency, alpha_presence);
+
+        if (!penalize_nl) {
+            for (size_t idx = 0; idx < cur_p.size; idx++) {
+                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
+                    cur_p.data[idx].logit = nl_logit;
+                    break;
+                }
+            }
+        }
+    }
+
+    if (grammar != NULL) {
+        llama_sample_grammar(ctx, &cur_p, grammar);
+    }
+
+    if (temp <= 0) {
+        // Greedy sampling
+        id = llama_sample_token_greedy(ctx, &cur_p);
+    } else {
+        if (mirostat == 1) {
+            static float mirostat_mu = 2.0f * mirostat_tau;
+            const int mirostat_m = 100;
+            llama_sample_temperature(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+        } else if (mirostat == 2) {
+            static float mirostat_mu = 2.0f * mirostat_tau;
+            llama_sample_temperature(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+        } else {
+            // Temperature sampling
+            llama_sample_top_k      (ctx, &cur_p, top_k, 1);
+            llama_sample_tail_free  (ctx, &cur_p, tfs_z, 1);
+            llama_sample_typical    (ctx, &cur_p, typical_p, 1);
+            llama_sample_top_p      (ctx, &cur_p, top_p, 1);
+            llama_sample_temperature(ctx, &cur_p, temp);
+
+            {
+                const int n_top = 10;
+                LOG("top %d candidates:\n", n_top);
+
+                for (int i = 0; i < n_top; i++) {
+                    const llama_token id = cur_p.data[i].id;
+                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
+                }
+            }
+
+            id = llama_sample_token(ctx, &cur_p);
+
+            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
+        }
+    }
+    // printf("`%d`", candidates_p.size);
+
+    if (grammar != NULL) {
+        llama_grammar_accept_token(ctx, grammar, id);
+    }
+
+    return id;
+}
+
+//
+// YAML utils
+//
+
 // returns true if successful, false otherwise
 bool create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -1045,7 +1194,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
     fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
     fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-    fprintf(stream, "hellaswag_tasks: %ld # default: 400\n", params.hellaswag_tasks);
+    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);

     const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
     const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
@@ -1091,6 +1240,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
+    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
     fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);


@@ -32,6 +32,7 @@ struct gpt_params {
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_draft = 16; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers = 0; // number of layers to store in VRAM
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
@@ -63,6 +64,7 @@ struct gpt_params {
     float cfg_scale = 1.f; // How strong is guidance
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string prompt = "";
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
@@ -156,6 +158,40 @@ std::string llama_detokenize_bpe(
         llama_context * ctx,
         const std::vector<llama_token> & tokens);

+//
+// Sampling utils
+//
+
+// this is a common sampling function used across the examples for convenience
+// it can serve as a starting point for implementing your own sampling function
+//
+// required:
+//  - ctx: context to use for sampling
+//  - params: sampling parameters
+//
+// optional:
+//  - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
+//  - grammar: grammar to use for sampling, ignore if NULL
+//  - last_tokens: needed for repetition penalty, ignore if empty
+//  - idx: sample from llama_get_logits(ctx) + idx * n_vocab
+//
+// returns:
+//  - token: sampled token
+//  - candidates: vector of candidate tokens
+//
+llama_token llama_sample_token(
+        struct llama_context * ctx,
+        struct llama_context * ctx_guidance,
+        struct llama_grammar * grammar,
+        const struct gpt_params & params,
+        const std::vector<llama_token> & last_tokens,
+        std::vector<llama_token_data> & candidates,
+        int idx = 0);
+
+//
+// YAML utils
+//
+
 bool create_directory_with_parents(const std::string & path);
 void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
 void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
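
The comment block above documents how the new `llama_sample_token` helper is meant to be called. Purely for illustration (this snippet is not part of the commit), a minimal generation loop built on it could look like the sketch below; it assumes the `llama_tokenize` wrapper from common.h and the core C API already used in this diff (`llama_eval`, `llama_token_eos`, `llama_token_to_piece`), and it leaves out error handling, guidance and grammar.

```cpp
// Hypothetical sketch, not part of this commit: greedy/sampled generation
// driven by the llama_sample_token helper declared in common.h.
#include "common.h"

#include <cstdio>
#include <tuple>
#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    llama_model   * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // evaluate the prompt once, then sample token by token
    std::vector<llama_token> last_tokens = ::llama_tokenize(ctx, params.prompt, true);
    llama_eval(ctx, last_tokens.data(), (int) last_tokens.size(), 0, params.n_threads);

    std::vector<llama_token_data> candidates;
    candidates.reserve(llama_n_vocab(ctx));

    int n_past = (int) last_tokens.size();
    for (int i = 0; i < 64; ++i) {
        // no guidance context, no grammar; idx = 0 -> logits of the last evaluated token
        const llama_token id = llama_sample_token(ctx, NULL, NULL, params, last_tokens, candidates, 0);
        if (id == llama_token_eos(ctx)) {
            break;
        }
        printf("%s", llama_token_to_piece(ctx, id).c_str());

        last_tokens.push_back(id);
        llama_eval(ctx, &id, 1, n_past, params.n_threads);
        n_past += 1;
    }

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```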


@@ -235,6 +235,7 @@ namespace console {
     int estimateWidth(char32_t codepoint) {
 #if defined(_WIN32)
+        (void)codepoint;
         return 1;
 #else
         return wcwidth(codepoint);


@@ -154,7 +154,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 // #include "log.h"
 //
 #ifndef LOG_NO_TIMESTAMPS
-#ifndef _WIN32
+#ifndef _MSC_VER
 #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
 #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
 #else
@@ -167,7 +167,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 #endif
 #ifdef LOG_TEE_TIMESTAMPS
-#ifndef _WIN32
+#ifndef _MSC_VER
 #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
 #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
 #else
@@ -187,7 +187,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 // #include "log.h"
 //
 #ifndef LOG_NO_FILE_LINE_FUNCTION
-#ifndef _WIN32
+#ifndef _MSC_VER
 #define LOG_FLF_FMT "[%24s:%5d][%24s] "
 #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
 #else
@@ -200,7 +200,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 #endif
 #ifdef LOG_TEE_FILE_LINE_FUNCTION
-#ifndef _WIN32
+#ifndef _MSC_VER
 #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
 #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
 #else
@@ -224,7 +224,7 @@ enum LogTriState
 // INTERNAL, DO NOT USE
 // USE LOG() INSTEAD
 //
-#ifndef _WIN32
+#ifndef _MSC_VER
 #define LOG_IMPL(str, ...) \
 { \
     if (LOG_TARGET != nullptr) \
@@ -247,7 +247,7 @@ enum LogTriState
 // INTERNAL, DO NOT USE
 // USE LOG_TEE() INSTEAD
 //
-#ifndef _WIN32
+#ifndef _MSC_VER
 #define LOG_TEE_IMPL(str, ...) \
 { \
     if (LOG_TARGET != nullptr) \
@@ -284,7 +284,7 @@ enum LogTriState
 // Main LOG macro.
 // behaves like printf, and supports arguments the exact same way.
 //
-#ifndef _WIN32
+#ifndef _MSC_VER
 #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
 #else
 #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
@@ -298,14 +298,14 @@ enum LogTriState
 // Secondary target can be changed just like LOG_TARGET
 // by defining LOG_TEE_TARGET
 //
-#ifndef _WIN32
+#ifndef _MSC_VER
 #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
 #else
 #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
 #endif

 // LOG macro variants with auto endline.
-#ifndef _WIN32
+#ifndef _MSC_VER
 #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
 #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
 #else
@@ -341,14 +341,14 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
         }
     }

+    if (_disabled)
+    {
+        // Log is disabled
+        return nullptr;
+    }
     if (_initialized)
     {
-        if (_disabled)
-        {
-            // Log is disabled
-            return nullptr;
-        }
         // with fallback in case something went wrong
         return logfile ? logfile : stderr;
     }
@@ -461,7 +461,7 @@ inline void log_test()
     LOG("13 Hello World this time in yet new file?\n")
     log_set_target(log_filename_generator("llama_autonamed", "log"));
     LOG("14 Hello World in log with generated filename!\n")
-#ifdef _WIN32
+#ifdef _MSC_VER
     LOG_TEE("15 Hello msvc TEE without arguments\n")
     LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
     LOG_TEELN("17 Hello msvc TEELN without arguments\n")


@@ -11,11 +11,14 @@ import sys
 from pathlib import Path
 from typing import Any

-import gguf
 import numpy as np
 import torch
 from transformers import AutoTokenizer  # type: ignore[import]

+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
 def bytes_to_unicode():
     # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py


@@ -11,11 +11,14 @@ import sys
 from pathlib import Path
 from typing import Any

-import gguf
 import numpy as np
 import torch
 from transformers import AutoTokenizer  # type: ignore[import]

+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py


@@ -1,261 +0,0 @@
#!/usr/bin/env python3
# 7b pth llama --> gguf conversion
# Only models with a single datafile are supported, like 7B
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if TYPE_CHECKING:
from typing import TypeAlias
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("consolidated."):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a PyTorch 7B LLaMA model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "LlamaForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
if num_parts > 1:
print("gguf: Only models with a single datafile are supported.")
sys.exit()
ARCH=gguf.MODEL_ARCH.LLAMA
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab and scores")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name == "rope.freqs":
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")


@@ -7,9 +7,13 @@ import struct
 import sys
 from pathlib import Path

-import gguf
 import numpy as np
+import os
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf

 # Note: Does not support GGML_QKK_64
 QK_K = 256
 # Items here are (block size, type size)


@@ -1,280 +0,0 @@
#!/usr/bin/env python3
# HF llama --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if TYPE_CHECKING:
from typing import TypeAlias
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
# reverse HF permute back to original pth layout
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "LlamaForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.LLAMA
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# reverse permute these
if name.endswith(".q_proj.weight"):
data = reverse_hf_permute(data, head_count)
if name.endswith(".k_proj.weight"):
data = reverse_hf_permute(data, head_count, head_count_kv)
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")


@@ -25,10 +25,14 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar

-import gguf
 import numpy as np
 from sentencepiece import SentencePieceProcessor  # type: ignore[import]

+import os
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
 if TYPE_CHECKING:
     from typing import TypeAlias
@ -319,15 +323,27 @@ class BpeVocab:
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
added_tokens: dict[str, int] added_tokens: dict[str, int]
if fname_added_tokens is not None: if fname_added_tokens is not None:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else: else:
added_tokens = {} # Fall back to trying to find the added tokens in tokenizer.json
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
if not tokenizer_json_file.is_file():
added_tokens = {}
else:
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
added_tokens = dict(
(item['content'], item['id'])
for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary.
if item['content'] not in self.bpe_tokenizer )
vocab_size: int = len(self.bpe_tokenizer) vocab_size: int = len(self.bpe_tokenizer)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values()) actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids: if expected_ids != actual_ids:
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") expected_end_id = vocab_size + len(actual_ids) - 1
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_list = [text for (text, idx) in items] self.added_tokens_list = [text for (text, idx) in items]
@ -341,10 +357,22 @@ class BpeVocab:
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import] from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
byte_encoder = tokenization_gpt2.bytes_to_unicode() byte_encoder = tokenization_gpt2.bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()} byte_decoder = {v: k for k, v in byte_encoder.items()}
score = 0.0
for i, item in enumerate(tokenizer): for i, item in enumerate(tokenizer):
text: bytes = item.encode("utf-8") text: bytes = item.encode("utf-8")
score: float = -i # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
yield text, score, gguf.TokenType.USER_DEFINED if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
if i == 0 and text == b'<unk>':
toktype = gguf.TokenType.UNKNOWN
elif i == 1 or i == 2:
toktype = gguf.TokenType.CONTROL
elif i >= 3 and text.startswith(b'<0x'):
toktype = gguf.TokenType.BYTE
else:
toktype = gguf.TokenType.NORMAL
else:
toktype = gguf.TokenType.NORMAL
yield text, score, toktype
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list: for text in self.added_tokens_list:
@ -526,7 +554,7 @@ class LazyTensor:
raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
LazyModel = dict[str, LazyTensor] LazyModel: TypeAlias = 'dict[str, LazyTensor]'
@dataclass @dataclass
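The tightened error message in BpeVocab above reflects the invariant that added tokens must occupy the contiguous ID range right after the base vocabulary. A small self-contained sketch of that check (function name and sample IDs are illustrative):

```python
# Illustrative re-statement of the ID check performed in BpeVocab.__init__ above.
def check_added_tokens(vocab_size: int, added_tokens: dict) -> None:
    expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
    actual_ids = sorted(added_tokens.values())
    if expected_ids != actual_ids:
        expected_end_id = vocab_size + len(actual_ids) - 1
        raise ValueError(
            f"Expected the {len(actual_ids)} added token ID(s) to be sequential "
            f"in the range {vocab_size} - {expected_end_id}; got {actual_ids}")

check_added_tokens(32000, {"<pad>": 32000, "<extra>": 32001})  # passes
# check_added_tokens(32000, {"<pad>": 31999})                  # would raise
```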

View file

@ -24,6 +24,7 @@ else()
add_subdirectory(finetune) add_subdirectory(finetune)
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple) add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(embd-input) add_subdirectory(embd-input)
add_subdirectory(llama-bench) add_subdirectory(llama-bench)
add_subdirectory(beam-search) add_subdirectory(beam-search)

View file

@ -1617,15 +1617,10 @@ int main(int argc, char ** argv) {
float error_before_opt = ggml_get_f32_1d(e, 0); float error_before_opt = ggml_get_f32_1d(e, 0);
struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
opt_params_adam.print_forward_graph = false;
opt_params_adam.print_backward_graph = false;
opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_forward_graph = false;
opt_params_lbfgs.print_backward_graph = false; opt_params_lbfgs.print_backward_graph = false;
opt_params_adam.adam.n_iter = 16;
opt_params_lbfgs.lbfgs.n_iter = 16; opt_params_lbfgs.lbfgs.n_iter = 16;
// ggml_opt(ctx0, opt_params_adam, e);
ggml_opt(ctx0, opt_params_lbfgs, e); ggml_opt(ctx0, opt_params_lbfgs, e);
// //
ggml_build_forward_expand(&gf, e); ggml_build_forward_expand(&gf, e);

View file

@ -22,7 +22,9 @@
#include <unistd.h> #include <unistd.h>
#elif defined (_WIN32) #elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
#define NOMINMAX #ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h> #include <windows.h>
#include <signal.h> #include <signal.h>
#endif #endif
@ -73,7 +75,7 @@ void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_stat
assert(0u < beams_state.n_beams); assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens; const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n); std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%lu", n); printf("%zu", n);
} }
fflush(stdout); fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration #if 1 // DEBUG: print current beams for this iteration
@ -145,7 +147,7 @@ int main(int argc, char ** argv)
if (tokens_list.size() > max_tokens_list_size) if (tokens_list.size() > max_tokens_list_size)
{ {
fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" , fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size ); __func__ , tokens_list.size() , max_tokens_list_size );
return 1; return 1;
} }

View file

@ -75,7 +75,7 @@ typedef struct {
int seq_len; // max sequence length int seq_len; // max sequence length
} Config; } Config;
typedef struct { struct TransformerWeights {
// token embedding table // token embedding table
float* token_embedding_table; // (vocab_size, dim) float* token_embedding_table; // (vocab_size, dim)
// weights for rmsnorms // weights for rmsnorms
@ -97,7 +97,22 @@ typedef struct {
// float* freq_cis_imag; // (seq_len, dim/2) // float* freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer // (optional) classifier weights for the logits, on the last layer
float* wcls; float* wcls;
} TransformerWeights;
~TransformerWeights() {
delete[] token_embedding_table;
delete[] rms_att_weight;
delete[] rms_ffn_weight;
delete[] wq;
delete[] wk;
delete[] wv;
delete[] wo;
delete[] w1;
delete[] w2;
delete[] w3;
delete[] rms_final_weight;
delete[] wcls;
}
};
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy // we calloc instead of malloc to keep valgrind happy
@ -173,21 +188,6 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
return 0; return 0;
} }
void free_weights(TransformerWeights* w) {
delete w->token_embedding_table;
delete w->rms_att_weight;
delete w->rms_ffn_weight;
delete w->wq;
delete w->wk;
delete w->wv;
delete w->wo;
delete w->w1;
delete w->w2;
delete w->w3;
delete w->rms_final_weight;
if (w->wcls) delete w->wcls;
}
void print_sample_weights(TransformerWeights *w){ void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight values of all the variables\n"); printf("----- Quick print of first of the weight values of all the variables\n");
printf("%f\n", w->token_embedding_table[0]); printf("%f\n", w->token_embedding_table[0]);
@ -596,6 +596,10 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
// assume llama2.c vocabulary // assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
llama_file file(filename, "rb"); llama_file file(filename, "rb");
if (!file.fp) {
fprintf(stderr, "error: %s: %s\n", strerror(errno), filename);
exit(1);
}
const int n_vocab = config->vocab_size; const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused /* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab); vocab->id_to_token.resize(n_vocab);
@ -633,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
} }
} }
void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct; int ct;
switch (gg_weights->n_dims){ switch (gg_weights->n_dims){
case 1: case 1:
@ -670,13 +674,13 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar
} }
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
// stuff AK weights into GG weights one by one. // convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings // w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor // float* -> struct ggml_tensor
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table); convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
//print_row(model->norm, 0); //print_row(model->norm, 0);
// for rms-att-weight // for rms-att-weight
@ -686,18 +690,18 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i]; auto & layer = model->layers[i];
// 1d // 1d
stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim // from 3d matrix layer x dim x dim to 2d matrix dim x dim
stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]); convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]); convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]); convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]); convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]); convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
} }
struct gguf_context * ctx = gguf_init_empty(); struct gguf_context * ctx = gguf_init_empty();
@ -898,7 +902,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
} }
std::string basename(const std::string &path) { std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/"); size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) { if (pos == std::string::npos) {
return path; return path;
} }
@ -911,7 +915,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
Config config; Config config;
TransformerWeights weights; TransformerWeights weights = {};
{ {
FILE *file = fopen(params.fn_llama2c_model, "rb"); FILE *file = fopen(params.fn_llama2c_model, "rb");
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; } if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
@ -953,6 +957,5 @@ int main(int argc, char ** argv) {
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model); printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx); ggml_free(model.ctx);
free_weights(&weights);
return 0; return 0;
} }
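The convert_weights_ak_to_gg calls above slice one flat llama2.c array into per-layer matrices; the same indexing written as a small numpy sketch (shapes are made up, not from a real checkpoint):

```python
import numpy as np

# Illustrative shapes only: llama2.c stores each per-layer weight as one flat array,
# laid out layer after layer, so layer i of a (dim x dim) matrix starts at i*dim*dim.
n_layer, dim = 2, 4
wq_flat      = np.arange(n_layer * dim * dim, dtype=np.float32)  # all layers of wq
rms_att_flat = np.arange(n_layer * dim,       dtype=np.float32)  # all layers of the 1d norm

i = 1  # layer index, as in &w->wq[i*row_length*row_length] above
wq_layer      = wq_flat[i * dim * dim:(i + 1) * dim * dim].reshape(dim, dim)
rms_att_layer = rms_att_flat[i * dim:(i + 1) * dim]

print(wq_layer.shape, rms_att_layer.shape)  # (4, 4) (4,)
```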

View file

@ -660,9 +660,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
ggml_tensor * gpt_neox_ff( ggml_tensor * gpt_neox_ff(
const gpt_neox_block &block, const gpt_neox_block &block,
ggml_context * ctx0, ggml_context * ctx0,
ggml_tensor * inp) { ggml_tensor * inp,
const gpt_neox_hparams &hparams) {
ggml_tensor * cur = ggml_norm(ctx0, inp); ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur)); cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur); cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
@ -753,7 +754,7 @@ bool gpt_neox_eval(
// self-attention // self-attention
{ {
{ {
cur = ggml_norm(ctx0, inpL); cur = ggml_norm(ctx0, inpL, hparams.norm_eps);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur), ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
@ -844,7 +845,7 @@ bool gpt_neox_eval(
if (hparams.par_res == 0) { if (hparams.par_res == 0) {
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF); cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams);
// input for next layer // input for next layer
inpL = ggml_add(ctx0, cur, inpFF); inpL = ggml_add(ctx0, cur, inpFF);
@ -853,7 +854,7 @@ bool gpt_neox_eval(
// this is independent of the self-attention result, so it could be done in parallel to the self-attention // this is independent of the self-attention result, so it could be done in parallel to the self-attention
// note here we pass inpL instead of cur // note here we pass inpL instead of cur
cur = gpt_neox_ff(model.blocks[il], ctx0, inpL); cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams);
// layer input + FF // layer input + FF
cur = ggml_add(ctx0, cur, inpFF); cur = ggml_add(ctx0, cur, inpFF);
@ -867,7 +868,7 @@ bool gpt_neox_eval(
// norm // norm
{ {
inpL = ggml_norm(ctx0, inpL); inpL = ggml_norm(ctx0, inpL, hparams.norm_eps);
// inpL = ln_f_g*inpL + ln_f_b // inpL = ln_f_g*inpL + ln_f_b
inpL = ggml_add(ctx0, inpL = ggml_add(ctx0,

View file

@ -34,7 +34,7 @@ For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.): #### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \ ./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
'User: Hi 'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk? AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure! User: Sure!
@ -45,7 +45,7 @@ User:'
#### Windows: #### Windows:
```powershell ```powershell
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
``` ```
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):

View file

@ -116,7 +116,7 @@ int main(int argc, char ** argv) {
#ifndef LOG_DISABLE_LOGS #ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log")); log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n"); LOG_TEE("Log start\n");
log_dump_cmdline(argc,argv); log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS #endif // LOG_DISABLE_LOGS
// TODO: Dump params ? // TODO: Dump params ?
@ -425,8 +425,9 @@ int main(int argc, char ** argv) {
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n"); LOG_TEE("\n\n");
struct llama_grammar * grammar = NULL;
grammar_parser::parse_state parsed_grammar; grammar_parser::parse_state parsed_grammar;
llama_grammar * grammar = NULL;
if (!params.grammar.empty()) { if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str()); parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors // will be empty (default) if there are parse errors
@ -450,8 +451,8 @@ int main(int argc, char ** argv) {
} }
// TODO: replace with ring-buffer // TODO: replace with ring-buffer
std::vector<llama_token> last_n_tokens(n_ctx); std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); std::fill(last_tokens.begin(), last_tokens.end(), 0);
if (params.interactive) { if (params.interactive) {
const char *control_message; const char *control_message;
@ -492,13 +493,10 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd; std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance; std::vector<llama_token> embd_guidance;
{ const int n_vocab = llama_n_vocab(ctx);
LOG("warming up the model with an empty run\n");
const std::vector<llama_token> tmp = { llama_token_bos(ctx), }; std::vector<llama_token_data> candidates;
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); candidates.reserve(n_vocab);
llama_reset_timings(ctx);
}
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict // predict
@ -537,8 +535,8 @@ int main(int argc, char ** argv) {
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
// insert n_left/2 tokens at the start of embd from last_n_tokens // insert n_left/2 tokens at the start of embd from last_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
@ -637,20 +635,6 @@ int main(int argc, char ** argv) {
embd_guidance.clear(); embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
const float repeat_penalty = params.repeat_penalty;
const float alpha_presence = params.presence_penalty;
const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
// optionally save the session on first sample (for faster prompt loading next time) // optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false; need_to_save_session = false;
@ -659,98 +643,12 @@ int main(int argc, char ** argv) {
LOG("saved session to %s\n", path_session.c_str()); LOG("saved session to %s\n", path_session.c_str());
} }
llama_token id = 0; const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
{ last_tokens.erase(last_tokens.begin());
auto logits = llama_get_logits(ctx); last_tokens.push_back(id);
auto n_vocab = llama_n_vocab(ctx);
// Apply params.logit_bias map LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
}
// Apply penalties
float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &cur_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
if (grammar != NULL) {
llama_sample_grammar(ctx, &cur_p, grammar);
}
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &cur_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k (ctx, &cur_p, top_k, 1);
llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
llama_sample_typical (ctx, &cur_p, typical_p, 1);
llama_sample_top_p (ctx, &cur_p, top_p, 1);
llama_sample_temperature(ctx, &cur_p, temp);
{
const int n_top = 10;
LOG("top %d candidates:\n", n_top);
for (int i = 0; i < n_top; i++) {
const llama_token id = cur_p.data[i].id;
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
}
}
id = llama_sample_token(ctx, &cur_p);
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
}
}
// printf("`%d`", candidates_p.size);
if (grammar != NULL) {
llama_grammar_accept_token(ctx, grammar, id);
}
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_n_tokens));
}
embd.push_back(id); embd.push_back(id);
@ -766,8 +664,8 @@ int main(int argc, char ** argv) {
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) { while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]); embd.push_back(embd_inp[n_consumed]);
last_n_tokens.erase(last_n_tokens.begin()); last_tokens.erase(last_tokens.begin());
last_n_tokens.push_back(embd_inp[n_consumed]); last_tokens.push_back(embd_inp[n_consumed]);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
break; break;
@ -800,7 +698,7 @@ int main(int argc, char ** argv) {
// check for reverse prompt // check for reverse prompt
if (params.antiprompt.size()) { if (params.antiprompt.size()) {
std::string last_output; std::string last_output;
for (auto id : last_n_tokens) { for (auto id : last_tokens) {
last_output += llama_token_to_piece(ctx, id); last_output += llama_token_to_piece(ctx, id);
} }
@ -831,7 +729,7 @@ int main(int argc, char ** argv) {
} }
// deal with end of text token in interactive mode // deal with end of text token in interactive mode
if (last_n_tokens.back() == llama_token_eos(ctx)) { if (last_tokens.back() == llama_token_eos(ctx)) {
LOG("found EOS token\n"); LOG("found EOS token\n");
if (params.interactive) { if (params.interactive) {
@ -933,7 +831,7 @@ int main(int argc, char ** argv) {
if (grammar != NULL) { if (grammar != NULL) {
llama_grammar_free(grammar); llama_grammar_free(grammar);
std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules()); std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init( grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root")); parsed_grammar.symbol_ids.at("root"));
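The inline sampling code removed above is consolidated into a single llama_sample_token helper shared with the new speculative example. A rough Python sketch of the ordering that helper preserves, repetition penalty first, then greedy or top-k/temperature sampling (all names and the heavy simplifications are assumptions, not the common-code implementation):

```python
import math
import random

def sample_token(logits, last_tokens, temp=0.8, top_k=40,
                 repeat_last_n=64, repeat_penalty=1.1):
    # Sketch only: the real helper also applies logit bias, guidance, grammar
    # constraints, tfs/typical/top-p and optional mirostat before this point.
    logits = list(logits)

    # penalize tokens generated recently
    for tok in set(last_tokens[-repeat_last_n:]):
        if logits[tok] > 0:
            logits[tok] /= repeat_penalty
        else:
            logits[tok] *= repeat_penalty

    if temp <= 0:
        return max(range(len(logits)), key=lambda t: logits[t])  # greedy

    # top-k, temperature scaling, softmax weights, then sample
    cand = sorted(range(len(logits)), key=lambda t: logits[t], reverse=True)[:top_k]
    scaled = [logits[t] / temp for t in cand]
    m = max(scaled)
    weights = [math.exp(x - m) for x in scaled]
    return random.choices(cand, weights=weights, k=1)[0]

random.seed(42)
print(sample_token([0.1, 2.0, 0.5, 1.5], last_tokens=[1, 1], temp=0.7, top_k=3))
```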

View file

@ -35,6 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
}; };
@ -71,12 +73,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// //
void usage(const char * executable) { void usage(const char * executable) {
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
fprintf(stderr, "\nAllowed quantization types:\n"); printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) { for (auto & it : QUANT_OPTIONS) {
printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str()); if (it.name != "COPY") {
printf(" %2d or ", it.ftype);
} else {
printf(" ");
}
printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
} }
exit(1); exit(1);
} }
@ -121,6 +128,9 @@ int main(int argc, char ** argv) {
// export as [inp path]/ggml-model-[ftype].gguf // export as [inp path]/ggml-model-[ftype].gguf
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf"; fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
arg_idx++; arg_idx++;
if (ftype_str == "COPY") {
params.only_copy = true;
}
} }
else { else {
fname_out = argv[arg_idx]; fname_out = argv[arg_idx];
@ -133,6 +143,10 @@ int main(int argc, char ** argv) {
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
return 1; return 1;
} else {
if (ftype_str == "COPY") {
params.only_copy = true;
}
} }
arg_idx++; arg_idx++;
} }

View file

@ -17,6 +17,8 @@
#include "completion.js.hpp" #include "completion.js.hpp"
#include "json-schema-to-grammar.mjs.hpp" #include "json-schema-to-grammar.mjs.hpp"
#include <cstddef>
#ifndef SERVER_VERBOSE #ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1 #define SERVER_VERBOSE 1
#endif #endif
@ -1054,7 +1056,7 @@ static json format_timings(llama_server_context &llama)
{ {
const auto timings = llama_get_timings(llama.ctx); const auto timings = llama_get_timings(llama.ctx);
assert(timings.n_eval == llama.num_tokens_predicted); assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
return json{ return json{
{"prompt_n", timings.n_p_eval}, {"prompt_n", timings.n_p_eval},
@ -1255,7 +1257,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
const llama_token * tokens = beams_state.beam_views[0].tokens; const llama_token * tokens = beams_state.beam_views[0].tokens;
const auto map = [](llama_token tok) { return completion_token_output{{},tok}; }; const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map); std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
printf("%lu", n); printf("%zu", n);
} }
fflush(stdout); fflush(stdout);
#if 0 // DEBUG: print current beams for this iteration #if 0 // DEBUG: print current beams for this iteration
@ -1393,7 +1395,13 @@ int main(int argc, char **argv)
} }
} }
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs); auto probs = llama.generated_token_probs;
if (llama.params.n_probs > 0 && llama.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
}
const json data = format_final_response(llama, llama.generated_text, probs);
llama_print_timings(llama.ctx); llama_print_timings(llama.ctx);
@ -1470,7 +1478,11 @@ int main(int argc, char **argv)
if (!llama.has_next_token) { if (!llama.has_next_token) {
// Generation is done, send extra information. // Generation is done, send extra information.
const json data = format_final_response(llama, "", llama.generated_token_probs); const json data = format_final_response(
llama,
"",
std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
);
const std::string str = const std::string str =
"data: " + "data: " +
@ -1564,7 +1576,7 @@ int main(int argc, char **argv)
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep)
{ {
const auto * fmt = "500 Internal Server Error\n%s"; const char fmt[] = "500 Internal Server Error\n%s";
char buf[BUFSIZ]; char buf[BUFSIZ];
try { try {
std::rethrow_exception(std::move(ep)); std::rethrow_exception(std::move(ep));
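The final-response change above keeps the server from reporting per-token probabilities for the stop word itself; the trim amounts to dropping the last len(stop_word_toks) entries. A tiny sketch with made-up token strings:

```python
# Illustrative data: per-token outputs plus the tokenized stopping word.
generated_token_probs = ["Hel", "lo", " wor", "ld", "</s", ">"]
stop_word_tokens      = ["</s", ">"]

n_probs = 2  # stands in for llama.params.n_probs > 0
if n_probs > 0 and stop_word_tokens:
    probs = generated_token_probs[:len(generated_token_probs) - len(stop_word_tokens)]
else:
    probs = generated_token_probs

print(probs)  # ['Hel', 'lo', ' wor', 'ld']
```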

View file

@ -0,0 +1,8 @@
set(TARGET speculative)
add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -0,0 +1,234 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "build-info.h"
#include "common.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.model_draft.empty()) {
fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init(params.numa);
llama_model * model_tgt = NULL;
llama_model * model_dft = NULL;
llama_context * ctx_tgt = NULL;
llama_context * ctx_dft = NULL;
// load the target model
params.perplexity = true; // HACK: enable logits_all = true
std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
// load the draft model
params.model = params.model_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
const int max_context_size = llama_n_ctx(ctx_tgt);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
}
fflush(stderr);
const int n_input = inp.size();
const auto t_enc_start = ggml_time_us();
// eval the prompt with both models
llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads);
llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads);
llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads);
const auto t_enc_end = ggml_time_us();
// the 2 models should have the same vocab
const int n_ctx = llama_n_ctx(ctx_tgt);
const int n_vocab = llama_n_vocab(ctx_tgt);
//GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
// how many tokens to draft each time
const int n_draft = params.n_draft;
int n_predict = 0;
int n_drafted = 0;
int n_accept = 0;
int n_past_tgt = inp.size();
int n_past_dft = inp.size();
std::vector<llama_token> drafted;
std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_tokens.begin(), last_tokens.end(), 0);
for (auto & id : inp) {
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
// used to determine end of generation
bool has_eos = false;
const auto t_dec_start = ggml_time_us();
while (true) {
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
// sample from the drafted tokens if any
int i_dft = 0;
while (true) {
const llama_token id = llama_sample_token(ctx_tgt, NULL, NULL, params, last_tokens, candidates, i_dft);
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
printf("%s", token_str.c_str());
fflush(stdout);
if (id == llama_token_eos(ctx_tgt)) {
has_eos = true;
}
++n_predict;
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
LOG("drafted token %d accepted\n", id);
++n_accept;
++n_past_tgt;
++n_past_dft;
++i_dft;
continue;
}
// the drafted token was rejected or we are out of drafted tokens
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
++n_past_dft;
drafted.clear();
drafted.push_back(id);
break;
}
if (n_predict > params.n_predict || has_eos) {
break;
}
// sample n_draft tokens from the draft model picking the best token
int n_past_cur = n_past_dft;
for (int i = 0; i < n_draft; ++i) {
float * logits = llama_get_logits(ctx_dft);
candidates.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
// computes softmax and sorts the candidates
llama_sample_softmax(ctx_dft, &cur_p);
for (int i = 0; i < 3; ++i) {
LOG(" - draft candidate %d: %d (%.3f)\n", i, cur_p.data[i].id, cur_p.data[i].p);
}
// stop drafting if the draft model is not confident enough (top candidate less than twice as likely as the runner-up)
if (cur_p.data[0].p < 2*cur_p.data[1].p) {
break;
}
drafted.push_back(cur_p.data[0].id);
++n_drafted;
if (i < n_draft - 1) {
// evaluate the drafted token on the draft model
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
++n_past_cur;
}
}
// evaluate the target model on the drafted tokens
llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
++n_past_tgt;
drafted.erase(drafted.begin());
}
auto t_dec_end = ggml_time_us();
LOG_TEE("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
// TODO: make sure these numbers are computed correctly
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("\ndraft:\n");
llama_print_timings(ctx_dft);
LOG_TEE("\ntarget:\n");
llama_print_timings(ctx_tgt);
llama_free(ctx_tgt);
llama_free_model(model_tgt);
llama_free(ctx_dft);
llama_free_model(model_dft);
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}
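The new example implements a basic form of speculative decoding: a small draft model proposes a short run of tokens and the target model verifies them, keeping only the prefix where the two agree. A toy Python sketch of that accept/reject loop with both models stubbed out (nothing here is the llama.cpp API; the models, prompt and vocabulary are made up):

```python
import random

random.seed(0)
VOCAB = list(range(10))

def draft_model(ctx):
    # stand-in for greedy sampling from the small, cheap draft model
    return (sum(ctx) * 7 + 3) % 10

def target_model(ctx):
    # stand-in for the larger target model; it usually agrees with the draft
    return draft_model(ctx) if random.random() < 0.8 else random.choice(VOCAB)

def speculative_decode(prompt, n_new=8, n_draft=4):
    ctx = list(prompt)
    n_accept = n_drafted = 0
    while len(ctx) < len(prompt) + n_new:
        # 1) draft a short continuation with the cheap model
        drafted, dctx = [], list(ctx)
        for _ in range(n_draft):
            tok = draft_model(dctx)
            drafted.append(tok)
            dctx.append(tok)
        n_drafted += len(drafted)

        # 2) verify with the target model: keep going while it agrees,
        #    discard the rest of the draft at the first mismatch
        for d in drafted:
            tok = target_model(ctx)
            ctx.append(tok)
            if tok == d:
                n_accept += 1
            else:
                break
    return ctx, n_accept, n_drafted

out, n_accept, n_drafted = speculative_decode([1, 2, 3])
print(out)
print(f"accepted {n_accept}/{n_drafted} drafted tokens")
```

The real example additionally stops drafting early when the draft model's top candidate is less than twice as probable as the runner-up, and it evaluates all drafted tokens on the target model in one batched call (the logits_all hack near the top of main) rather than one position at a time.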

View file

@ -2,13 +2,16 @@
# train-text-from-scratch checkpoint --> gguf conversion # train-text-from-scratch checkpoint --> gguf conversion
import argparse import argparse
import gguf
import os import os
import struct import struct
import sys import sys
import numpy as np import numpy as np
from pathlib import Path from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
import gguf
# gguf constants # gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type" LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"

View file

@ -1,3 +1,8 @@
// defines MAP_ANONYMOUS
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml.h" #include "ggml.h"
#include <assert.h> #include <assert.h>
@ -6,6 +11,26 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/types.h>
#include <sys/mman.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <memoryapi.h>
#endif
#define UNUSED(x) (void)(x) #define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@ -99,19 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
} }
#endif #endif
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
return ggml_nbytes(tensor); return ggml_nbytes(tensor);
UNUSED(alloc); UNUSED(alloc);
} }
// check if a tensor is allocated by this buffer
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
void * ptr = tensor->data;
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
}
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
#endif #endif
size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment); size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@ -178,17 +208,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
} }
// this is a very naive implementation, but for our case the number of free blocks should be very small // this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
void * ptr = tensor->data; void * ptr = tensor->data;
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { if (ggml_allocr_is_own(alloc, tensor) == false) {
// the tensor was not allocated in this buffer // the tensor was not allocated in this buffer
// this can happen because the graph allocator will try to free weights and other tensors from different buffers // this can happen because the graph allocator will try to free weights and other tensors from different buffers
// the easiest way to deal with this is just to ignore it // the easiest way to deal with this is just to ignore it
return; return;
} }
size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment); size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size); AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
@ -283,17 +313,64 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
return alloc; return alloc;
} }
// address and size of the buffer when measuring // OS specific functions to allocate and free uncommitted virtual memory
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers static void * alloc_vmem(size_t size) {
static void * const MEASURE_BASE_ADDR = (void *) (-(1ULL<<40) - 1); #if defined(_WIN32)
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
#elif defined(_POSIX_MAPPED_FILES)
return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
#else
// use a fixed address for other platforms
uintptr_t base_addr = (uintptr_t)-size - 0x100;
return (void *)base_addr;
#endif
}
static void free_vmem(void * base_addr, size_t size) {
#if defined(_WIN32)
VirtualFree(base_addr, 0, MEM_RELEASE);
UNUSED(size);
#elif defined(_POSIX_MAPPED_FILES)
munmap(base_addr, size);
#else
// nothing to do
UNUSED(base_addr);
UNUSED(size);
#endif
}
// allocate uncommitted virtual memory to measure the size of the graph
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
// 1TB for 64-bit, 1GB for 32-bit
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
do {
*base_addr = alloc_vmem(*size);
if (*base_addr != NULL) {
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
return;
}
// try again with half the size
*size /= 2;
} while (*size > 0);
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
}
static void free_measure_vmem(void * base_addr, size_t size) {
free_vmem(base_addr, size);
}
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
void * base_addr;
size_t size;
alloc_measure_vmem(&base_addr, &size);
*alloc = (struct ggml_allocr){ *alloc = (struct ggml_allocr){
/*.data = */ MEASURE_BASE_ADDR, /*.data = */ base_addr,
/*.size = */ MEASURE_MAX_SIZE, /*.size = */ size,
/*.alignment = */ alignment, /*.alignment = */ alignment,
/*.n_free_blocks = */ 0, /*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}}, /*.free_blocks = */ {{0}},
@ -313,6 +390,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
} }
void ggml_allocr_free(struct ggml_allocr * alloc) { void ggml_allocr_free(struct ggml_allocr * alloc) {
if (alloc->measure) {
free_measure_vmem(alloc->data, alloc->size);
}
free(alloc); free(alloc);
} }
@ -382,8 +462,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
} }
// if the node's data is external, then we cannot re-use it // if the node's data is external, then we cannot re-use it
if ((char *) parent->data < (char *) alloc->data || if (ggml_allocr_is_own(alloc, parent) == false) {
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
continue; continue;
} }
@ -417,7 +496,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
} }
} }
static size_t ggml_allocator_alloc_graph_tensors_n( static size_t ggml_allocr_alloc_graph_tensors_n(
struct ggml_allocr * alloc, struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs, struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@ -495,11 +574,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
AT_PRINTF("\n"); AT_PRINTF("\n");
} }
// update parents // update parents
// update immediately if there is no parse_seq // update immediately if there is no parse_seq
// update only at barriers if there is parse_seq // update only at barriers if there is parse_seq
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) { if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind; int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
int update_end = alloc->parse_seq_len ? ind : ind + 1; int update_end = alloc->parse_seq_len ? ind : ind + 1;
for (int i = update_start; i < update_end; i++) { for (int i = update_start; i < update_end; i++) {
@ -523,12 +601,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
view_src_hn->n_views -= 1; view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocator_free_tensor(alloc, view_src); ggml_allocr_free_tensor(alloc, view_src);
} }
} }
else { else {
if (parent->data != node->data) { if (parent->data != node->data) {
ggml_allocator_free_tensor(alloc, parent); ggml_allocr_free_tensor(alloc, parent);
} }
} }
} }
@ -545,7 +623,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
for (int i = 0; outputs[g][i] != NULL; i++) { for (int i = 0; outputs[g][i] != NULL; i++) {
struct ggml_tensor * output = outputs[g][i]; struct ggml_tensor * output = outputs[g][i];
AT_PRINTF("output: %s\n", output->name); AT_PRINTF("output: %s\n", output->name);
ggml_allocator_free_tensor(alloc, output); ggml_allocr_free_tensor(alloc, output);
} }
} }
} }
@ -554,7 +632,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
} }
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
} }
size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
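Conceptually the measure allocator never touches memory: it now reserves uncommitted virtual address space (instead of the old hard-coded base address) and only tracks offsets so that ggml_allocr_max_size can report how large the real buffer must be. A toy Python sketch of that idea (not the ggml data structures, and it ignores the free-block reuse the real allocator performs):

```python
class MeasureAllocator:
    """Toy sketch: hands out offsets instead of pointers and records the high-water mark."""

    def __init__(self, alignment=32):
        self.alignment = alignment
        self.offset = 0
        self.max_size = 0

    def alloc(self, nbytes):
        nbytes = (nbytes + self.alignment - 1) // self.alignment * self.alignment
        addr = self.offset          # would be base_addr + offset with a reserved mapping
        self.offset += nbytes
        self.max_size = max(self.max_size, self.offset)
        return addr

alloc = MeasureAllocator()
for tensor_bytes in (1000, 4096, 12):
    alloc.alloc(tensor_bytes)
print(alloc.max_size)  # 5152 bytes -> size to request for the real buffer
```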

View file

@ -81,12 +81,29 @@
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300 #define __CUDA_ARCH__ 1300
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) { static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a); const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b); const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
return reinterpret_cast<const int&>(c); return reinterpret_cast<const int&>(c);
#else
int8x4_t c;
int16_t tmp;
#pragma unroll
for (int i = 0; i < 4; i++) {
tmp = va[i] - vb[i];
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
c[i] = tmp;
}
return reinterpret_cast<int&>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
} }
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
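The __vsubss4 fallback above does a lane-wise saturating subtraction when __builtin_elementwise_sub_sat is unavailable; the same clamping expressed in a few lines of Python for reference (a sketch, not CUDA/HIP code):

```python
def vsubss4(a4, b4):
    """Subtract four int8 lanes with saturation, mirroring the fallback loop above."""
    return [max(-128, min(127, a - b)) for a, b in zip(a4, b4)]

print(vsubss4([100, -100, 5, 0], [-50, 50, 3, 0]))  # [127, -128, 2, 0]
```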

View file

@ -76,6 +76,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@ -116,10 +117,24 @@ static NSString * const msl_library_source = @"see metal.metal";
struct ggml_metal_context * ggml_metal_init(int n_cb) { struct ggml_metal_context * ggml_metal_init(int n_cb) {
metal_printf("%s: allocating\n", __func__); metal_printf("%s: allocating\n", __func__);
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); // Show all the Metal device instances in the system
NSArray * devices = MTLCopyAllDevices();
id <MTLDevice> device;
NSString * s;
for (device in devices) {
s = [device name];
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
}
// Pick and show default Metal device
device = MTLCreateSystemDefaultDevice();
s = [device name];
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
// Configure context
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
ctx->device = device;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue]; ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0; ctx->n_buffers = 0;
ctx->concur_list_len = 0; ctx->concur_list_len = 0;
@ -205,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@ -270,6 +286,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(norm); GGML_METAL_DEL_KERNEL(norm);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@ -680,6 +697,12 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ADD: case GGML_OP_ADD:
{ {
GGML_ASSERT(ggml_is_contiguous(src0));
// utilize float4
GGML_ASSERT(ne00 % 4 == 0);
const int64_t nb = ne00/4;
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10) {
// src1 is a row // src1 is a row
[encoder setComputePipelineState:ctx->pipeline_add_row]; [encoder setComputePipelineState:ctx->pipeline_add_row];
@ -689,14 +712,20 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; [encoder setBytes:&nb length:sizeof(nb) atIndex:3];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_MUL: case GGML_OP_MUL:
{ {
GGML_ASSERT(ggml_is_contiguous(src0));
// utilize float4
GGML_ASSERT(ne00 % 4 == 0);
const int64_t nb = ne00/4;
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10) {
// src1 is a row // src1 is a row
[encoder setComputePipelineState:ctx->pipeline_mul_row]; [encoder setComputePipelineState:ctx->pipeline_mul_row];
@ -706,9 +735,9 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; [encoder setBytes:&nb length:sizeof(nb) atIndex:3];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
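Note on the hunks above: the add/mul paths now assert ne00 % 4 == 0, pass nb = ne00/4 instead of ne00, and dispatch ggml_nelements(dst)/4 threads, so each Metal thread handles one float4. A minimal scalar C sketch of the row-broadcast variant of that idea (names and the float4_t stand-in are illustrative, not from the codebase):

    #include <stddef.h>

    typedef struct { float v[4]; } float4_t;   /* stand-in for Metal's float4 */

    /* One iteration of the outer loop corresponds to one Metal thread.
       n4 = total elements / 4, nb = row length / 4 (mirrors the new kernel args). */
    static void add_row_vec4(const float4_t *src0, const float4_t *src1,
                             float4_t *dst, size_t n4, size_t nb) {
        for (size_t tpig = 0; tpig < n4; ++tpig) {
            for (int k = 0; k < 4; ++k) {
                dst[tpig].v[k] = src0[tpig].v[k] + src1[tpig % nb].v[k]; /* broadcast src1 row */
            }
        }
    }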
@ -840,9 +869,13 @@ void ggml_metal_graph_compute(
switch (src0t) { switch (src0t) {
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
nth0 = 64; nth0 = 32;
nth1 = 1; nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; if (ne11 * ne12 < 4) {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
} else {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
}
} break; } break;
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
{ {
@ -894,8 +927,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 2; nth0 = 4; //1;
nth1 = 32; nth1 = 8; //32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
} break; } break;
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
@ -943,9 +976,12 @@ void ggml_metal_graph_compute(
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 || if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q3_K) { else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -959,8 +995,8 @@ void ggml_metal_graph_compute(
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; int64_t ny = (ne11 + 3)/4;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
} }
} break; } break;
View file
@ -25,9 +25,9 @@ typedef struct {
} block_q8_0; } block_q8_0;
kernel void kernel_add( kernel void kernel_add(
device const float * src0, device const float4 * src0,
device const float * src1, device const float4 * src1,
device float * dst, device float4 * dst,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig]; dst[tpig] = src0[tpig] + src1[tpig];
} }
@ -35,18 +35,18 @@ kernel void kernel_add(
// assumption: src1 is a row // assumption: src1 is a row
// broadcast src1 into src0 // broadcast src1 into src0
kernel void kernel_add_row( kernel void kernel_add_row(
device const float * src0, device const float4 * src0,
device const float * src1, device const float4 * src1,
device float * dst, device float4 * dst,
constant int64_t & ne00, constant int64_t & nb,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig % ne00]; dst[tpig] = src0[tpig] + src1[tpig % nb];
} }
kernel void kernel_mul( kernel void kernel_mul(
device const float * src0, device const float4 * src0,
device const float * src1, device const float4 * src1,
device float * dst, device float4 * dst,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * src1[tpig]; dst[tpig] = src0[tpig] * src1[tpig];
} }
@ -54,12 +54,12 @@ kernel void kernel_mul(
// assumption: src1 is a row // assumption: src1 is a row
// broadcast src1 into src0 // broadcast src1 into src0
kernel void kernel_mul_row( kernel void kernel_mul_row(
device const float * src0, device const float4 * src0,
device const float * src1, device const float4 * src1,
device float * dst, device float4 * dst,
constant int64_t & ne00, constant int64_t & nb,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * src1[tpig % ne00]; dst[tpig] = src0[tpig] * src1[tpig % nb];
} }
kernel void kernel_scale( kernel void kernel_scale(
@ -133,19 +133,24 @@ kernel void kernel_soft_max(
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
if (tpitg[0] == 0) { // the loop, and when that is done, buf[0] has the correct (synchronized) value
buf[0] = buf[0]; //if (tpitg[0] == 0) {
} // buf[0] = buf[0];
//}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float max = buf[0]; const float max = buf[0];
// parallel sum // parallel sum
buf[tpitg[0]] = 0.0f; buf[tpitg[0]] = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
buf[tpitg[0]] += exp(psrc0[i00] - max); const float exp_psrc0 = exp(psrc0[i00] - max);
buf[tpitg[0]] += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not
// wish to compute it twice.
pdst[i00] = exp_psrc0;
} }
// reduce // reduce
@ -157,17 +162,18 @@ kernel void kernel_soft_max(
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast // broadcast - not needed, see above
if (tpitg[0] == 0) { //// broadcast
buf[0] = buf[0]; //if (tpitg[0] == 0) {
} // buf[0] = buf[0];
//}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float sum = buf[0]; const float sum = buf[0];
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
pdst[i00] = exp(psrc0[i00] - max) / sum; pdst[i00] /= sum;
} }
} }
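The soft_max rewrite above removes the redundant broadcast steps and, more importantly, stores exp(psrc0[i] - max) into pdst during the sum pass so the final pass is just a division. A scalar C sketch of that two-pass structure (illustrative only, not the kernel itself):

    #include <math.h>
    #include <stddef.h>

    /* Two-pass softmax that caches exp(x - max) so exp() runs once per element. */
    static void softmax_cached_exp(const float *x, float *y, size_t n) {
        float max = x[0];
        for (size_t i = 1; i < n; ++i) if (x[i] > max) max = x[i];

        float sum = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            const float e = expf(x[i] - max);   /* computed once ...            */
            y[i] = e;                           /* ... and remembered, as above */
            sum += e;
        }
        for (size_t i = 0; i < n; ++i) y[i] /= sum;
    }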
@ -214,25 +220,27 @@ kernel void kernel_norm(
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast //// broadcast
if (tpitg == 0) { //if (tpitg == 0) {
sum[0] /= ne00; // sum[0] /= ne00;
} //}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float mean = sum[0]; const float mean = sum[0];
// recenter // recenter and VARIANCE
device float * y = dst + tgpig*ne00; device float * y = dst + tgpig*ne00;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
y[i00] = x[i00] - mean;
}
// VARIANCE
// parallel sum
sum[tpitg] = 0.0f; sum[tpitg] = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) { for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
y[i00] = x[i00] - mean;
sum[tpitg] += y[i00] * y[i00]; sum[tpitg] += y[i00] * y[i00];
} }
//// VARIANCE
//// parallel sum
//sum[tpitg] = 0.0f;
//for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
// sum[tpitg] += y[i00] * y[i00];
//}
// reduce // reduce
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg/2; i > 0; i /= 2) { for (uint i = ntg/2; i > 0; i /= 2) {
@ -241,11 +249,11 @@ kernel void kernel_norm(
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast //// broadcast
if (tpitg == 0) { //if (tpitg == 0) {
sum[0] /= ne00; // sum[0] /= ne00;
} //}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float variance = sum[0]; const float variance = sum[0];
const float scale = 1.0f/sqrt(variance + eps); const float scale = 1.0f/sqrt(variance + eps);
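kernel_norm now folds the recenter loop and the variance accumulation into a single pass over the row: y[i] = x[i] - mean is written and immediately squared into the partial sum. A scalar sketch of the fused loops (eps supplied by the caller; plain C, not the kernel):

    #include <math.h>
    #include <stddef.h>

    /* Normalization with recenter and variance fused, as in the restructured kernel_norm. */
    static void norm_row(const float *x, float *y, size_t n, float eps) {
        float mean = 0.0f;
        for (size_t i = 0; i < n; ++i) mean += x[i];
        mean /= (float)n;

        float var = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            y[i] = x[i] - mean;      /* recenter ...                              */
            var += y[i] * y[i];      /* ... and accumulate variance in same pass  */
        }
        var /= (float)n;

        const float scale = 1.0f / sqrtf(var + eps);
        for (size_t i = 0; i < n; ++i) y[i] *= scale;
    }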
@ -435,6 +443,8 @@ kernel void kernel_mul_mat_q4_1_f32(
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
} }
#define NB_Q8_0 8
kernel void kernel_mul_mat_q8_0_f32( kernel void kernel_mul_mat_q8_0_f32(
device const void * src0, device const void * src0,
device const float * src1, device const float * src1,
@ -463,30 +473,30 @@ kernel void kernel_mul_mat_q8_0_f32(
device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0; device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[16]; float yl[NB_Q8_0];
float sumf[nr]={0.f}; float sumf[nr]={0.f};
const int ix = tiisg/2; const int ix = tiisg/4;
const int il = tiisg%2; const int il = tiisg%4;
device const float * yb = y + ix * QK8_0 + 16*il; device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
// each thread in a SIMD group deals with half a block. // each thread in a SIMD group deals with NB_Q8_0 quants at a time
for (int ib = ix; ib < nb; ib += nw/2) { for (int ib = ix; ib < nb; ib += nw/4) {
for (int i = 0; i < 16; ++i) { for (int i = 0; i < NB_Q8_0; ++i) {
yl[i] = yb[i]; yl[i] = yb[i];
} }
for (int row = 0; row < nr; row++) { for (int row = 0; row < nr; row++) {
device const int8_t * qs = x[ib+row*nb].qs + 16*il; device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
float sumq = 0.f; float sumq = 0.f;
for (int iq = 0; iq < 16; ++iq) { for (int iq = 0; iq < NB_Q8_0; ++iq) {
sumq += qs[iq] * yl[iq]; sumq += qs[iq] * yl[iq];
} }
sumf[row] += sumq*x[ib+row*nb].d; sumf[row] += sumq*x[ib+row*nb].d;
} }
yb += QK8_0 * 16; yb += NB_Q8_0 * nw;
} }
for (int row = 0; row < nr; ++row) { for (int row = 0; row < nr; ++row) {
@ -497,6 +507,60 @@ kernel void kernel_mul_mat_q8_0_f32(
} }
} }
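For reference, what kernel_mul_mat_q8_0_f32 parallelizes is the block-wise dot product sketched below; the change above only redistributes work so each SIMD lane handles NB_Q8_0 = 8 quants per step instead of 16. The struct here is a simplified stand-in for block_q8_0 (a scale plus 32 int8 quants), not the exact ggml definition:

    #include <stddef.h>
    #include <stdint.h>

    #define QK8_0 32
    typedef struct {
        float  d;             /* stand-in for the half-precision scale */
        int8_t qs[QK8_0];     /* quantized values                      */
    } q8_0_block_t;

    /* Scalar reference: dot product of a q8_0-quantized row with a float vector. */
    static float dot_q8_0_f32(const q8_0_block_t *x, const float *y, size_t nblocks) {
        float sumf = 0.0f;
        for (size_t ib = 0; ib < nblocks; ++ib) {
            float sumq = 0.0f;
            for (int i = 0; i < QK8_0; ++i) {
                sumq += (float)x[ib].qs[i] * y[ib*QK8_0 + i];
            }
            sumf += sumq * x[ib].d;     /* apply the per-block scale once */
        }
        return sumf;
    }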
kernel void kernel_mul_mat_f16_f32_1row(
device const char * src0,
device const char * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
if (ne00 < 128) {
for (int i = tiisg; i < ne00; i += 32) {
sumf += (float) x[i] * (float) y[i];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
} else {
device const half4 * x4 = (device const half4 *) x;
device const float4 * y4 = (device const float4 *) y;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
#define N_F16_F32 4
kernel void kernel_mul_mat_f16_f32( kernel void kernel_mul_mat_f16_f32(
device const char * src0, device const char * src0,
device const char * src1, device const char * src1,
@ -515,37 +579,58 @@ kernel void kernel_mul_mat_f16_f32(
constant uint64_t & nb12, constant uint64_t & nb12,
constant int64_t & ne0, constant int64_t & ne0,
constant int64_t & ne1, constant int64_t & ne1,
threadgroup float * sum [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpig[[thread_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) {
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 tptg[[threads_per_threadgroup]]) {
const int64_t r0 = tgpig.x; const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y; const int64_t rb = tgpig.y*N_F16_F32;
const int64_t im = tgpig.z; const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
sum[tpitg.x] = 0.0f; if (ne00 < 128) {
for (int row = 0; row < N_F16_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
for (int i = tpitg.x; i < ne00; i += tptg.x) { device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
sum[tpitg.x] += (float) x[i] * (float) y[i];
}
// accumulate the sum from all threads in the threadgroup float sumf = 0;
threadgroup_barrier(mem_flags::mem_threadgroup); for (int i = tiisg; i < ne00; i += 32) {
for (uint i = tptg.x/2; i > 0; i /= 2) { sumf += (float) x[i] * (float) y[i];
if (tpitg.x < i) { }
sum[tpitg.x] += sum[tpitg.x + i];
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
} else {
device const half4 * x4 = (device const half4 *)x;
for (int row = 0; row < N_F16_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
device const float4 * y4 = (device const float4 *) y;
float sumf = 0;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
} }
threadgroup_barrier(mem_flags::mem_threadgroup);
} }
if (tpitg.x == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
}
} }
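Both new f16 x f32 kernels follow the same pattern: for short rows (ne00 < 128) use a plain strided loop, otherwise walk the row four elements at a time and mop up the remaining ne00 % 4 elements scalar before the simd_sum. A plain C sketch of the "vectorized body plus scalar tail" structure (both inputs kept as float here for simplicity):

    #include <stddef.h>

    static float dot_vec4_with_tail(const float *x, const float *y, size_t n) {
        float sum = 0.0f;
        size_t i = 0;
        for (; i + 4 <= n; i += 4) {            /* main loop: 4 products per step */
            for (int k = 0; k < 4; ++k) sum += x[i + k] * y[i + k];
        }
        for (; i < n; ++i) sum += x[i] * y[i];  /* tail: the n % 4 leftovers      */
        return sum;
    }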
kernel void kernel_alibi_f32( kernel void kernel_alibi_f32(
@ -1244,7 +1329,8 @@ kernel void kernel_mul_mat_q4_K_f32(
const int r0 = tgpig.x; const int r0 = tgpig.x;
const int r1 = tgpig.y; const int r1 = tgpig.y;
const int r2 = tgpig.z; const int r2 = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int first_row = r0 * N_DST;
const int ib_row = first_row * nb; const int ib_row = first_row * nb;
const uint offset0 = r2/gqa*(nb*ne0); const uint offset0 = r2/gqa*(nb*ne0);
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
View file
@ -1493,7 +1493,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data; d_X = (cl_mem) src0->data;
} else { } else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
} }
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
273
ggml.c
View file
@ -301,6 +301,10 @@ typedef double ggml_float;
#endif #endif
#endif #endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#ifdef __F16C__ #ifdef __F16C__
#ifdef _MSC_VER #ifdef _MSC_VER
@ -813,46 +817,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
#if !defined(__aarch64__) #if !defined(__aarch64__)
inline static uint16_t vaddvq_u8(uint8x16_t v) {
return
(uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
(uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
(uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
(uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
(uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
(uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
(uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
(uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
}
inline static int16_t vaddvq_s8(int8x16_t v) {
return
(int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
(int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
(int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
(int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
(int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
(int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
(int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
(int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
}
inline static int32_t vaddvq_s16(int16x8_t v) {
return
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
}
inline static uint32_t vaddvq_u16(uint16x8_t v) {
return
(uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
(uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
(uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
(uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
}
inline static int32_t vaddvq_s32(int32x4_t v) { inline static int32_t vaddvq_s32(int32x4_t v) {
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
} }
@ -861,12 +825,6 @@ inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
} }
inline static float vminvq_f32(float32x4_t v) {
return
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
inline static float vmaxvq_f32(float32x4_t v) { inline static float vmaxvq_f32(float32x4_t v) {
return return
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@ -2677,6 +2635,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
} }
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
}
*s = sumf;
#else #else
// scalar // scalar
float sumf = 0.0; float sumf = 0.0;
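The RISC-V intrinsics added above vectorize the same arithmetic as the scalar fallback that follows: split each q4_0 byte into its low and high nibble, recenter by 8, multiply with the matching q8_0 quants, reduce, and scale by the product of the two block scales. A condensed scalar sketch (block layouts simplified; the real block types live in ggml.c):

    #include <stddef.h>
    #include <stdint.h>

    #define QK 32
    typedef struct { float d; uint8_t qs[QK/2]; } q4_0_t;   /* scale + packed nibbles (simplified) */
    typedef struct { float d; int8_t  qs[QK];   } q8_0_t;   /* scale + int8 quants    (simplified) */

    static float dot_q4_0_q8_0(const q4_0_t *x, const q8_0_t *y, size_t nb) {
        float sumf = 0.0f;
        for (size_t i = 0; i < nb; ++i) {
            int sumi = 0;
            for (int j = 0; j < QK/2; ++j) {
                const int v0 = (x[i].qs[j] & 0x0F) - 8;   /* low nibble, recentered  */
                const int v1 = (x[i].qs[j] >>   4) - 8;   /* high nibble, recentered */
                sumi += v0 * y[i].qs[j] + v1 * y[i].qs[j + QK/2];
            }
            sumf += sumi * x[i].d * y[i].d;
        }
        return sumf;
    }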
@ -2803,6 +2796,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
} }
*s = hsum_float_8(acc) + summs; *s = hsum_float_8(acc) + summs;
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
*s = sumf;
#else #else
// scalar // scalar
float sumf = 0.0; float sumf = 0.0;
@ -3037,6 +3062,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
} }
*s = hsum_float_8(acc); *s = hsum_float_8(acc);
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
uint32_t qh;
// These temp values are for masking and shift operations
uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));
// temporary registers
vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
// ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
// ((qh & (1u << (j + 16))) >> (j + 12));
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
// narrowing
vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
// load
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
}
*s = sumf;
#else #else
// scalar // scalar
float sumf = 0.0; float sumf = 0.0;
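The q5_0 path above performs, in vector form, the bit manipulation spelled out in its comments: the fifth bit of each quant lives in the 32-bit qh word and has to be re-attached to the nibble before the usual recenter-and-multiply. A scalar sketch of just the unpacking step for byte j of a block (hypothetical helper, simplified types):

    #include <stdint.h>

    /* Rebuild the two 5-bit values that share byte j of a q5_0 block.
       qs is the packed nibble array, qh holds the 32 extra high bits. */
    static void unpack_q5_0_pair(const uint8_t *qs, uint32_t qh, int j, int *x0, int *x1) {
        const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;  /* high bit for low nibble  */
        const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));       /* high bit for high nibble */

        *x0 = (int)((qs[j] & 0x0F) | xh_0) - 16;   /* 5-bit value, recentered around 0 */
        *x1 = (int)((qs[j] >>   4) | xh_1) - 16;
    }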
@ -3293,6 +3388,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
} }
*s = hsum_float_8(acc) + summs; *s = hsum_float_8(acc) + summs;
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
uint32_t qh;
// These temp values are for shift operations
uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));
// temporary registers
vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
// load qh
vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
// ((qh >> (j + 0)) << 4) & 0x10;
vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
// ((qh >> (j + 12)) ) & 0x10;
vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
// narrowing
vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
// load
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
*s = sumf;
#else #else
// scalar // scalar
float sumf = 0.0; float sumf = 0.0;
@ -3404,6 +3565,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
} }
*s = hsum_float_8(acc); *s = hsum_float_8(acc);
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
size_t vl = __riscv_vsetvl_e8m1(qk);
for (int i = 0; i < nb; i++) {
// load elements
vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
}
*s = sumf;
#else #else
// scalar // scalar
float sumf = 0.0; float sumf = 0.0;
View file
@ -801,7 +801,7 @@ class SpecialVocab:
else: else:
continue continue
for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content): for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
if isinstance(maybe_token_id, int): if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
self.special_token_ids[typ] = maybe_token_id self.special_token_ids[typ] = maybe_token_id
break break
return True return True
@ -814,7 +814,7 @@ class SpecialVocab:
config = json.load(f) config = json.load(f)
for typ in self.special_token_types: for typ in self.special_token_types:
maybe_token_id = config.get(f'{typ}_token_id') maybe_token_id = config.get(f'{typ}_token_id')
if isinstance(maybe_token_id, int): if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
self.special_token_ids[typ] = maybe_token_id self.special_token_ids[typ] = maybe_token_id
return True return True
View file
@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "gguf" name = "gguf"
version = "0.3.1" version = "0.3.2"
description = "Write ML models in GGUF for GGML" description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"] authors = ["GGML <ggml@ggml.ai>"]
packages = [ packages = [
42
grammars/c.gbnf Normal file
View file
@ -0,0 +1,42 @@
root ::= (declaration)*
declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}"
dataType ::= "int" ws | "float" ws | "char" ws
identifier ::= [a-zA-Z_] [a-zA-Z_0-9]*
parameter ::= dataType identifier
statement ::=
( dataType identifier ws "=" ws expression ";" ) |
( identifier ws "=" ws expression ";" ) |
( identifier ws "(" argList? ")" ";" ) |
( "return" ws expression ";" ) |
( "while" "(" condition ")" "{" statement* "}" ) |
( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) |
( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) |
( singleLineComment ) |
( multiLineComment )
forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression
forUpdate ::= identifier ws "=" ws expression
condition ::= expression relationOperator expression
relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">")
expression ::= term (("+" | "-") term)*
term ::= factor(("*" | "/") factor)*
factor ::= identifier | number | unaryTerm | funcCall | parenExpression
unaryTerm ::= "-" factor
funcCall ::= identifier "(" argList? ")"
parenExpression ::= "(" ws expression ws ")"
argList ::= expression ("," ws expression)*
number ::= [0-9]+
singleLineComment ::= "//" [^\n]* "\n"
multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/"
ws ::= ([ \t\n]+)
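As a quick sanity check of the grammar above, here is a tiny function it should accept by my reading of the rules. Note that whitespace is only permitted where a ws rule appears (after dataType, around "=", after "return"), so the statements are packed onto one line:

    int main(){int x = 1;return x;}

If the grammar is later relaxed with optional whitespace between statements, conventionally formatted C bodies would also parse.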
View file
@ -13,6 +13,26 @@
// //
#include <arm_neon.h> #include <arm_neon.h>
#if !defined(__aarch64__)
inline static int32_t vaddvq_s16(int16x8_t v) {
return
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
}
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
return vcombine_s16(a0, b0);
}
inline static int32_t vaddvq_s32(int32x4_t v) {
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}
#endif
#else #else
#ifdef __wasm_simd128__ #ifdef __wasm_simd128__
@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
int ntry, float alpha) { int ntry, float alpha) {
float min = x[0]; float min = x[0];
float max = x[0]; float max = x[0];
float sum_x = 0;
float sum_x2 = 0;
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
if (x[i] < min) min = x[i]; if (x[i] < min) min = x[i];
if (x[i] > max) max = x[i]; if (x[i] > max) max = x[i];
sum_x += x[i];
sum_x2 += x[i]*x[i];
} }
if (max == min) { if (max == min) {
for (int i = 0; i < n; ++i) L[i] = 0; for (int i = 0; i < n; ++i) L[i] = 0;
@ -1306,7 +1322,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
const uint8x16_t m3 = vdupq_n_u8(0x3); const uint8x16_t m3 = vdupq_n_u8(0x3);
const uint8x16_t m4 = vdupq_n_u8(0xF); const uint8x16_t m4 = vdupq_n_u8(0xF);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0); const int32x4_t vzero = vdupq_n_s32(0);
#endif
int8x16x2_t q2bytes; int8x16x2_t q2bytes;
uint8_t aux[16]; uint8_t aux[16];
@ -1612,7 +1630,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
#ifdef __ARM_NEON #ifdef __ARM_NEON
const uint8x16_t m3 = vdupq_n_u8(0x3); const uint8x16_t m3 = vdupq_n_u8(0x3);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0); const int32x4_t vzero = vdupq_n_s32(0);
#endif
int8x16x4_t q2bytes; int8x16x4_t q2bytes;
@ -2060,7 +2080,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
__m256 acc = _mm256_setzero_ps(); __m256 acc = _mm256_setzero_ps();
uint32_t *aux; const uint32_t *aux;
for (int i = 0; i < nb; ++i) { for (int i = 0; i < nb; ++i) {
@ -2070,7 +2090,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
const int8_t * restrict q8 = y[i].qs; const int8_t * restrict q8 = y[i].qs;
// Set up scales // Set up scales
aux = (uint32_t *)x[i].scales; aux = (const uint32_t *)x[i].scales;
__m128i scales128 = _mm_set_epi32( __m128i scales128 = _mm_set_epi32(
((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@ -2596,8 +2616,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
const uint8_t * restrict q4 = x[i].qs; const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs; const int8_t * restrict q8 = y[i].qs;
//int32x4_t isum = mzero;
int32_t sumi1 = 0; int32_t sumi1 = 0;
int32_t sumi2 = 0; int32_t sumi2 = 0;
@ -3096,9 +3114,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
#ifdef __ARM_NEON #ifdef __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t m4b = vdupq_n_u8(0xf);
const int32x4_t mzero = vdupq_n_s32(0);
const uint8x16_t mone = vdupq_n_u8(1); const uint8x16_t mone = vdupq_n_u8(1);
const uint8x16_t mtwo = vdupq_n_u8(2); const uint8x16_t mtwo = vdupq_n_u8(2);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t mzero = vdupq_n_s32(0);
#endif
int8x16x4_t q5bytes; int8x16x4_t q5bytes;
@ -3441,8 +3461,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
#ifdef __ARM_NEON #ifdef __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t m4b = vdupq_n_u8(0xf);
const int32x4_t mzero = vdupq_n_s32(0);
const uint8x16_t mh = vdupq_n_u8(16); const uint8x16_t mh = vdupq_n_u8(16);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t mzero = vdupq_n_s32(0);
#endif
int8x16x4_t q5bytes; int8x16x4_t q5bytes;
uint8x16x4_t q5h; uint8x16x4_t q5h;
@ -3660,7 +3682,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
float sum = 0; float sum = 0;
const uint8x16_t m4b = vdupq_n_u8(0xF); const uint8x16_t m4b = vdupq_n_u8(0xF);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0); const int32x4_t vzero = vdupq_n_s32(0);
#endif
//const int8x16_t m32s = vdupq_n_s8(32); //const int8x16_t m32s = vdupq_n_s8(32);
const uint8x16_t mone = vdupq_n_u8(3); const uint8x16_t mone = vdupq_n_u8(3);
@ -4049,8 +4073,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
float sum = 0; float sum = 0;
const uint8x16_t m4b = vdupq_n_u8(0xF); const uint8x16_t m4b = vdupq_n_u8(0xF);
const int32x4_t vzero = vdupq_n_s32(0);
const int8x16_t m32s = vdupq_n_s8(32); const int8x16_t m32s = vdupq_n_s8(32);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0);
#endif
const uint8x16_t mone = vdupq_n_u8(3); const uint8x16_t mone = vdupq_n_u8(3);
View file
@ -325,6 +325,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_GPT2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
{
LLM_ARCH_GPTJ,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
{
LLM_ARCH_GPTNEOX,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_MPT,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
{
LLM_ARCH_UNKNOWN,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
}; };
static llm_arch llm_arch_from_string(const std::string & name) { static llm_arch llm_arch_from_string(const std::string & name) {
@ -1605,9 +1643,13 @@ static void llm_load_hparams(
GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
if (hparams.n_rot != hparams.n_embd / hparams.n_head) { if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head)); if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
}
} }
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
// gpt-j n_rot = rotary_dim
} }
// arch-specific KVs // arch-specific KVs
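The relaxed check matters because, as the new comments note, non-LLaMA architectures derive n_rot differently. For a hypothetical GPT-NeoX-style model with n_embd = 6144, n_head = 64 and rotary_pct = 0.25, the head size is 6144/64 = 96 but n_rot = 0.25 * 96 = 24, so the old unconditional n_rot == n_embd/n_head check would have rejected a valid model.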
@ -3324,9 +3366,15 @@ struct llm_tokenizer_bpe {
std::string byte_str(1, *j); std::string byte_str(1, *j);
auto token_multibyte = vocab.token_to_id.find(byte_str); auto token_multibyte = vocab.token_to_id.find(byte_str);
if (token_multibyte == vocab.token_to_id.end()) { if (token_multibyte == vocab.token_to_id.end()) {
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str()); try {
llama_token token_byte = llama_byte_to_token(vocab, *j);
output.push_back(token_byte);
} catch (const std::out_of_range & err) {
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
}
} else {
output.push_back((*token_multibyte).second);
} }
output.push_back((*token_multibyte).second);
} }
} else { } else {
output.push_back((*token).second); output.push_back((*token).second);
@ -3600,7 +3648,7 @@ static void llama_grammar_advance_stack(
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) { std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
if (stack.empty()) { if (stack.empty()) {
new_stacks.push_back(stack); new_stacks.emplace_back(stack);
return; return;
} }
@ -3637,7 +3685,7 @@ static void llama_grammar_advance_stack(
} }
case LLAMA_GRETYPE_CHAR: case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT: case LLAMA_GRETYPE_CHAR_NOT:
new_stacks.push_back(stack); new_stacks.emplace_back(stack);
break; break;
default: default:
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@ -4393,7 +4441,7 @@ struct llama_logit_info {
} }
return min_heap; return min_heap;
} }
float probability_from_logit(float logit) { float probability_from_logit(float logit) const {
return normalizer * std::exp(logit - max_l); return normalizer * std::exp(logit - max_l);
} }
}; };
@ -4683,6 +4731,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
llm_load_arch(*ml, model); llm_load_arch(*ml, model);
llm_load_hparams(*ml, model, 0, 0, 0); llm_load_hparams(*ml, model, 0, 0, 0);
if (params->only_copy) {
ftype = model.ftype;
}
const size_t align = GGUF_DEFAULT_ALIGNMENT; const size_t align = GGUF_DEFAULT_ALIGNMENT;
struct gguf_context * ctx_out = gguf_init_empty(); struct gguf_context * ctx_out = gguf_init_empty();
@ -4769,18 +4821,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// quantize only 2D tensors // quantize only 2D tensors
quantize &= (tensor->n_dims == 2); quantize &= (tensor->n_dims == 2);
quantize &= params->quantize_output_tensor || name != "output.weight"; quantize &= params->quantize_output_tensor || name != "output.weight";
quantize &= quantized_type != tensor->type; quantize &= !params->only_copy;
enum ggml_type new_type; enum ggml_type new_type;
void * new_data; void * new_data;
size_t new_size; size_t new_size;
if (!quantize) { if (quantize) {
new_type = tensor->type;
new_data = tensor->data;
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
new_type = quantized_type; new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS #ifdef GGML_USE_K_QUANTS
// TODO: avoid hardcoded tensor names - use the TN_* constants // TODO: avoid hardcoded tensor names - use the TN_* constants
@ -4879,7 +4926,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} }
} }
#endif #endif
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
quantize = tensor->type != new_type;
}
if (!quantize) {
new_type = tensor->type;
new_data = tensor->data;
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
const size_t nelements = ggml_nelements(tensor); const size_t nelements = ggml_nelements(tensor);
float * f32_data; float * f32_data;
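The restructuring above changes when the per-tensor "should we quantize" decision is finalized: only_copy now short-circuits everything, and the type comparison happens only after the (possibly k-quant-adjusted) new_type is known. A condensed, hypothetical sketch of the resulting control flow; pick of the final type is abstracted away and the enum is reduced to an int:

    #include <stdbool.h>
    #include <string.h>

    /* Returns whether the tensor will actually be quantized; *new_type_out gets
       the type it will be written with either way. */
    static bool should_quantize(int n_dims, const char *name, int tensor_type,
                                int requested_type, bool quantize_output, bool only_copy,
                                int *new_type_out) {
        bool quantize = (n_dims == 2)
                     && (quantize_output || strcmp(name, "output.weight") != 0)
                     && !only_copy;
        if (quantize) {
            quantize = (tensor_type != requested_type);  /* same type -> nothing to do */
        }
        *new_type_out = quantize ? requested_type : tensor_type;
        return quantize;
    }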
@ -5311,6 +5367,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.allow_requantize =*/ false, /*.allow_requantize =*/ false,
/*.quantize_output_tensor =*/ true, /*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false,
}; };
return result; return result;
View file
@ -164,6 +164,7 @@ extern "C" {
enum llama_ftype ftype; // quantize to this llama_ftype enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
} llama_model_quantize_params; } llama_model_quantize_params;
// grammar types // grammar types
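With the new field, a caller that just wants to rewrite a GGUF file without changing tensor types can do something like the sketch below (file names are placeholders, error handling omitted):

    #include "llama.h"

    int main(void) {
        struct llama_model_quantize_params params = llama_model_quantize_default_params();
        params.only_copy = true;   /* ftype, allow_requantize, quantize_output_tensor ignored */

        /* returns 0 on success */
        return llama_model_quantize("model-in.gguf", "model-out.gguf", &params);
    }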