Merge remote-tracking branch 'origin/master' into cmake-find-package

Resolved conflicts in CMakeLists.txt.
Mason M 2023-09-06 09:07:39 -03:00
commit 872cff8570
39 changed files with 3858 additions and 2807 deletions

.editorconfig
@@ -17,3 +17,6 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset
+
+[examples/server/public/*]
+indent_size = 2

.github/workflows/build.yml
@@ -18,7 +18,6 @@ on:
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   GGML_NLOOP: 3
-  GGML_NITER: 1
   GGML_N_THREADS: 1

 jobs:

.github/workflows/code-coverage.yml

@@ -0,0 +1,36 @@
+name: Code Coverage
+on: [push, pull_request]
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+
+jobs:
+  run:
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential gcc-8 lcov
+      - name: Build
+        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
+      - name: Run tests
+        run: CC=gcc-8 make test
+      - name: Generate coverage report
+        run: |
+          make coverage
+          make lcov-report
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+        with:
+          files: lcov-report/coverage.info

.gitignore

@@ -6,6 +6,10 @@
 *.exe
 *.dll
 *.log
+*.gcov
+*.gcno
+*.gcda
+*.dot
 .DS_Store
 .build/
 .cache/
@@ -17,6 +21,9 @@
 .vs/
 .vscode/

+lcov-report/
+gcovr-report/
+
 build*/
 out/
 tmp/
@@ -24,27 +31,29 @@ tmp/
 models/*
 models-mnt

-/main
-/quantize
-/quantize-stats
-/result
-/perplexity
-/embedding
-/train-text-from-scratch
-/convert-llama2c-to-ggml
-/simple
-/benchmark-matmult
-/vdot
-/server
 /Pipfile
+/baby-llama
+/beam-search
+/benchmark-matmult
+/convert-llama2c-to-ggml
 /embd-input-test
+/embedding
 /gguf
 /gguf-llama-simple
 /libllama.so
 /llama-bench
-/baby-llama
-/beam-search
+/main
+/metal
+/perplexity
+/quantize
+/quantize-stats
+/result
 /save-load-state
+/server
+/simple
+/speculative
+/train-text-from-scratch
+/vdot

 build-info.h
 arm_neon.h
 compile_commands.json

CMakeLists.txt
@@ -36,6 +36,12 @@ endif()
 # Option list
 #

+if (APPLE)
+    set(LLAMA_METAL_DEFAULT ON)
+else()
+    set(LLAMA_METAL_DEFAULT OFF)
+endif()
+
 # general
 option(LLAMA_STATIC        "llama: static link libraries"       OFF)
 option(LLAMA_NATIVE        "llama: enable -march=native flag"   OFF)
@@ -76,7 +82,8 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS       "llama: use hipBLAS"                 OFF)
 option(LLAMA_CLBLAST       "llama: use CLBlast"                 OFF)
-option(LLAMA_METAL         "llama: use Metal"                   OFF)
+option(LLAMA_METAL         "llama: use Metal"                   ${LLAMA_METAL_DEFAULT})
+option(LLAMA_METAL_NDEBUG  "llama: disable Metal debugging"     OFF)
 option(LLAMA_MPI           "llama: use MPI"                     OFF)
 option(LLAMA_K_QUANTS      "llama: use k-quants"                ON)
 option(LLAMA_QKK_64        "llama: use super-block size of 64 for k-quants" OFF)
@@ -158,6 +165,32 @@ if (APPLE AND LLAMA_ACCELERATE)
     endif()
 endif()

+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+
+    message(STATUS "Metal framework found")
+
+    set(GGML_HEADERS_METAL ggml-metal.h)
+    set(GGML_SOURCES_METAL ggml-metal.m)
+
+    add_compile_definitions(GGML_USE_METAL)
+
+    if (LLAMA_METAL_NDEBUG)
+        add_compile_definitions(GGML_METAL_NDEBUG)
+    endif()
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-metal.metal to bin directory
+    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+    )
+endif()
+
 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
@@ -295,30 +328,6 @@ if (LLAMA_CUBLAS)
     endif()
 endif()

-if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
-
-    set(GGML_HEADERS_METAL ggml-metal.h)
-    set(GGML_SOURCES_METAL ggml-metal.m)
-
-    add_compile_definitions(GGML_USE_METAL)
-    #add_compile_definitions(GGML_METAL_NDEBUG)
-
-    # get full path to the file
-    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
-    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
-
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-    )
-endif()
-
 if (LLAMA_MPI)
     cmake_minimum_required(VERSION 3.10)
     find_package(MPI)

Makefile

@@ -1,9 +1,45 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1

+# Code coverage output files
+COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
+
+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
+endif
+
+ifndef UNAME_P
+UNAME_P := $(shell uname -p)
+endif
+
+ifndef UNAME_M
+UNAME_M := $(shell uname -m)
+endif
+
+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+ifeq ($(UNAME_S),Darwin)
+    ifndef LLAMA_NO_METAL
+        LLAMA_METAL := 1
+    endif
+
+    ifneq ($(UNAME_P),arm)
+        SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
+        ifeq ($(SYSCTL_M),1)
+            # UNAME_P := arm
+            # UNAME_M := arm64
+            warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
+        endif
+    endif
+endif
+
+ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
+BUILD_TARGETS += metal
+endif
+
 default: $(BUILD_TARGETS)

 test:
test: test:
@@ -23,17 +59,17 @@ test:

 all: $(BUILD_TARGETS) $(TEST_TARGETS)

-ifndef UNAME_S
-UNAME_S := $(shell uname -s)
-endif
-
-ifndef UNAME_P
-UNAME_P := $(shell uname -p)
-endif
-
-ifndef UNAME_M
-UNAME_M := $(shell uname -m)
-endif
+coverage: ## Run code coverage
+    gcov -pb tests/*.cpp
+
+lcov-report: coverage ## Generate lcov report
+    mkdir -p lcov-report
+    lcov --capture --directory . --output-file lcov-report/coverage.info
+    genhtml lcov-report/coverage.info --output-directory lcov-report
+
+gcovr-report: coverage ## Generate gcovr report
+    mkdir -p gcovr-report
+    gcovr --root . --html --html-details --output gcovr-report/coverage.html

 ifdef RISCV_CROSS_COMPILE
 CC := riscv64-unknown-linux-gnu-gcc
@@ -43,19 +79,6 @@ endif

 CCV := $(shell $(CC) --version | head -n 1)
 CXXV := $(shell $(CXX) --version | head -n 1)

-# Mac OS + Arm can report x86_64
-# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
-ifeq ($(UNAME_S),Darwin)
-    ifneq ($(UNAME_P),arm)
-        SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
-        ifeq ($(SYSCTL_M),1)
-            # UNAME_P := arm
-            # UNAME_M := arm64
-            warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
-        endif
-    endif
-endif
-
 #
 # Compile flags
 #
@@ -67,63 +90,47 @@ OPT = -Ofast
 else
 OPT = -O3
 endif
-CFLAGS   = -I.            $(OPT) -std=c11   -fPIC
-CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
-LDFLAGS  =
+MK_CPPFLAGS = -I. -Icommon
+MK_CFLAGS   = $(CPPFLAGS) $(OPT) -std=c11   -fPIC
+MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
+MK_LDFLAGS  =

 ifdef LLAMA_DEBUG
-    CFLAGS   += -O0 -g
-    CXXFLAGS += -O0 -g
-    LDFLAGS  += -g
+    MK_CFLAGS   += -O0 -g
+    MK_CXXFLAGS += -O0 -g
+    MK_LDFLAGS  += -g
 else
-    CFLAGS   += -DNDEBUG
-    CXXFLAGS += -DNDEBUG
+    MK_CPPFLAGS += -DNDEBUG
 endif

 ifdef LLAMA_SERVER_VERBOSE
-    CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
+    MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
+endif
+
+ifdef LLAMA_CODE_COVERAGE
+    MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
 endif

 ifdef LLAMA_DISABLE_LOGS
-    CFLAGS   += -DLOG_DISABLE_LOGS
-    CXXFLAGS += -DLOG_DISABLE_LOGS
+    MK_CPPFLAGS += -DLOG_DISABLE_LOGS
 endif # LLAMA_DISABLE_LOGS

 # warnings
-CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-            -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
+MK_CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
+               -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
+MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

 ifeq '' '$(findstring clang++,$(CXX))'
     # g++ only
-    CXXFLAGS += -Wno-format-truncation
+    MK_CXXFLAGS += -Wno-format-truncation
 endif

 # OS specific
 # TODO: support Windows
-ifeq ($(UNAME_S),Linux)
-    CFLAGS   += -pthread
-    CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),Darwin)
-    CFLAGS   += -pthread
-    CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),FreeBSD)
-    CFLAGS   += -pthread
-    CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),NetBSD)
-    CFLAGS   += -pthread
-    CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),OpenBSD)
-    CFLAGS   += -pthread
-    CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),Haiku)
-    CFLAGS   += -pthread
-    CXXFLAGS += -pthread
+ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
+    MK_CFLAGS   += -pthread
+    MK_CXXFLAGS += -pthread
 endif

 # detect Windows
@@ -149,12 +156,11 @@ ifeq ($(_WIN32),1)
 endif

 ifdef LLAMA_GPROF
-    CFLAGS   += -pg
-    CXXFLAGS += -pg
+    MK_CFLAGS   += -pg
+    MK_CXXFLAGS += -pg
 endif
 ifdef LLAMA_PERF
-    CFLAGS   += -DGGML_PERF
-    CXXFLAGS += -DGGML_PERF
+    MK_CPPFLAGS += -DGGML_PERF
 endif

 # Architecture specific
@@ -165,104 +171,102 @@ ifndef RISCV

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
     # Use all CPU extensions that are available:
-    CFLAGS   += -march=native -mtune=native
-    CXXFLAGS += -march=native -mtune=native
+    MK_CFLAGS   += -march=native -mtune=native
+    MK_CXXFLAGS += -march=native -mtune=native

     # Usage AVX-only
-    #CFLAGS   += -mfma -mf16c -mavx
-    #CXXFLAGS += -mfma -mf16c -mavx
+    #MK_CFLAGS   += -mfma -mf16c -mavx
+    #MK_CXXFLAGS += -mfma -mf16c -mavx

     # Usage SSSE3-only (Not is SSE3!)
-    #CFLAGS   += -mssse3
-    #CXXFLAGS += -mssse3
+    #MK_CFLAGS   += -mssse3
+    #MK_CXXFLAGS += -mssse3
 endif

 # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
 # https://github.com/ggerganov/llama.cpp/issues/2922
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
-    CFLAGS   += -Xassembler -muse-unaligned-vector-move
-    CXXFLAGS += -Xassembler -muse-unaligned-vector-move
+    MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
+    MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
 endif

 ifneq ($(filter aarch64%,$(UNAME_M)),)
     # Apple M1, M2, etc.
     # Raspberry Pi 3, 4, Zero 2 (64-bit)
-    CFLAGS   += -mcpu=native
-    CXXFLAGS += -mcpu=native
+    MK_CFLAGS   += -mcpu=native
+    MK_CXXFLAGS += -mcpu=native
 endif

 ifneq ($(filter armv6%,$(UNAME_M)),)
     # Raspberry Pi 1, Zero
-    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+    MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+    MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif

 ifneq ($(filter armv7%,$(UNAME_M)),)
     # Raspberry Pi 2
-    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+    MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+    MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif

 ifneq ($(filter armv8%,$(UNAME_M)),)
     # Raspberry Pi 3, 4, Zero 2 (32-bit)
-    CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+    MK_CFLAGS   += -mfp16-format=ieee -mno-unaligned-access
+    MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

 ifneq ($(filter ppc64%,$(UNAME_M)),)
     POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
     ifneq (,$(findstring POWER9,$(POWER9_M)))
-        CFLAGS   += -mcpu=power9
-        CXXFLAGS += -mcpu=power9
+        MK_CFLAGS   += -mcpu=power9
+        MK_CXXFLAGS += -mcpu=power9
     endif
-    # Require c++23's std::byteswap for big-endian support.
-    ifeq ($(UNAME_M),ppc64)
-        CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
-    endif
 endif
 else
-    CFLAGS   += -march=rv64gcv -mabi=lp64d
-    CXXFLAGS += -march=rv64gcv -mabi=lp64d
+    MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
+    MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif

 ifndef LLAMA_NO_K_QUANTS
-    CFLAGS   += -DGGML_USE_K_QUANTS
-    CXXFLAGS += -DGGML_USE_K_QUANTS
+    MK_CPPFLAGS += -DGGML_USE_K_QUANTS
     OBJS     += k_quants.o
 ifdef LLAMA_QKK_64
-    CFLAGS   += -DGGML_QKK_64
-    CXXFLAGS += -DGGML_QKK_64
+    MK_CPPFLAGS += -DGGML_QKK_64
 endif
 endif

 ifndef LLAMA_NO_ACCELERATE
-    # Mac M1 - include Accelerate framework.
-    # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
+    # Mac OS - include Accelerate framework.
+    # `-framework Accelerate` works both with Apple Silicon and Mac Intel
     ifeq ($(UNAME_S),Darwin)
-        CFLAGS  += -DGGML_USE_ACCELERATE
-        LDFLAGS += -framework Accelerate
+        MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+        MK_LDFLAGS  += -framework Accelerate
     endif
 endif # LLAMA_NO_ACCELERATE

 ifdef LLAMA_MPI
-    CFLAGS   += -DGGML_USE_MPI -Wno-cast-qual
-    CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+    MK_CPPFLAGS += -DGGML_USE_MPI
+    MK_CFLAGS   += -Wno-cast-qual
+    MK_CXXFLAGS += -Wno-cast-qual
     OBJS     += ggml-mpi.o
 endif # LLAMA_MPI

 ifdef LLAMA_OPENBLAS
-    CFLAGS  += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
-    LDFLAGS += $(shell pkg-config --libs openblas)
+    MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
+    MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
+    MK_LDFLAGS  += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS

 ifdef LLAMA_BLIS
-    CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
-    LDFLAGS += -lblis -L/usr/local/lib
+    MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+    MK_LDFLAGS  += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-    CFLAGS   += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-    CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-    LDFLAGS  += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+    MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+    MK_LDFLAGS  += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
     OBJS      += ggml-cuda.o
     NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
@@ -313,14 +317,15 @@ endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
-    CFLAGS   += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
-    CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+    MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
+    MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
+    MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)

     # Mac provides OpenCL as a framework
     ifeq ($(UNAME_S),Darwin)
-        LDFLAGS += -lclblast -framework OpenCL
+        MK_LDFLAGS += -lclblast -framework OpenCL
     else
-        LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
+        MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
     endif
     OBJS += ggml-opencl.o
@@ -335,10 +340,9 @@ ifdef LLAMA_HIPBLAS
     LLAMA_CUDA_DMMV_X       ?= 32
     LLAMA_CUDA_MMV_Y        ?= 1
     LLAMA_CUDA_KQUANTS_ITER ?= 2
-    CFLAGS   += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-    CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-    LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-    LDFLAGS  += -lhipblas -lamdhip64 -lrocblas
+    MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+    MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+    MK_LDFLAGS  += -lhipblas -lamdhip64 -lrocblas
     HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
     HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
     HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
@@ -353,10 +357,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif # LLAMA_HIPBLAS

 ifdef LLAMA_METAL
-    CFLAGS   += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
-    CXXFLAGS += -DGGML_USE_METAL
-    LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
+    MK_CPPFLAGS += -DGGML_USE_METAL
+    MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
     OBJS        += ggml-metal.o
+ifdef LLAMA_METAL_NDEBUG
+    MK_CPPFLAGS += -DGGML_METAL_NDEBUG
+endif
 endif # LLAMA_METAL

 ifdef LLAMA_METAL
@@ -369,11 +375,17 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
     $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

-ifdef LLAMA_NO_K_QUANTS
+ifndef LLAMA_NO_K_QUANTS
 k_quants.o: k_quants.c k_quants.h
     $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_NO_K_QUANTS

+# combine build flags with cmdline overrides
+override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS   := $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+override LDFLAGS  := $(MK_LDFLAGS) $(LDFLAGS)
+
 #
 # Print build information
 #
@@ -417,7 +429,7 @@ libllama.so: llama.o ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-    rm -vf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h $(BUILD_TARGETS) $(TEST_TARGETS)
+    rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)

 #
 # Examples
@@ -475,9 +487,8 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
 beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
-BUILD_TARGETS += metal
-endif
+speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)

Package.swift
@@ -12,9 +12,18 @@ let package = Package(
             name: "llama",
             path: ".",
             exclude: ["ggml-metal.metal"],
-            sources: ["ggml.c", "llama.cpp"],
+            sources: [
+                "ggml.c",
+                "llama.cpp",
+                "ggml-alloc.c",
+                "k_quants.c"
+            ],
             publicHeadersPath: "spm-headers",
-            cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
+            cSettings: [
+                .unsafeFlags(["-Wno-shorten-64-to-32"]),
+                .define("GGML_USE_K_QUANTS"),
+                .define("GGML_USE_ACCELERATE")
+            ],
             linkerSettings: [
                 .linkedFramework("Accelerate")
             ]

README.md
@@ -120,6 +120,7 @@ as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
+- [withcatai/catai](https://github.com/withcatai/catai)

 ---
@@ -279,29 +280,11 @@ In order to build llama.cpp you have three different options.

 ### Metal Build

-Using Metal allows the computation to be executed on the GPU for Apple devices:
-
-- Using `make`:
-
-  ```bash
-  LLAMA_METAL=1 make
-  ```
-
-- Using `CMake`:
-
-    ```bash
-    mkdir build-metal
-    cd build-metal
-    cmake -DLLAMA_METAL=ON ..
-    cmake --build . --config Release
-    ```
-
-When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
-Any value larger than 0 will offload the computation to the GPU. For example:
-
-```bash
-./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
-```
+On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
+To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
+
+When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line
+argument.

 ### MPI Build
### MPI Build ### MPI Build
@@ -464,6 +447,8 @@ Building the program with BLAS support may lead to some performance improvements
   You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
     - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.

+    - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
+
   - <details>
       <summary>Installing the OpenCL SDK from source</summary>
@@ -481,10 +466,27 @@ Building the program with BLAS support may lead to some performance improvements
       ```
     </details>

-  Installing CLBlast: it may be found in your operating system's packages.
+  ##### Installing CLBlast
+
+  Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
+
+  Alternatively, they may be built from source.
+
+  - <details>
+    <summary>Windows:</summary>
+
+      ```cmd
+      set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
+      git clone https://github.com/CNugteren/CLBlast.git
+      mkdir CLBlast\build
+      cd CLBlast\build
+      cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
+      cmake --build . --config Release
+      cmake --install . --prefix C:/CLBlast
+      ```

   - <details>
-    <summary>If not, then installing from source:</summary>
+    <summary>Unix:</summary>

       ```sh
       git clone https://github.com/CNugteren/CLBlast.git
@@ -498,21 +500,32 @@ Building the program with BLAS support may lead to some performance improvements
     Where `/some/path` is where the built library will be installed (default is `/usr/local`).
     </details>

-  Building:
+  ##### Building Llama with CLBlast

   - Build with make:
     ```sh
    make LLAMA_CLBLAST=1
     ```
-  - CMake:
+  - CMake (Unix):
     ```sh
     mkdir build
     cd build
     cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
     cmake --build . --config Release
     ```
+  - CMake (Windows):
+    ```cmd
+    set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
+    git clone https://github.com/ggerganov/llama.cpp
+    cd llama.cpp
+    mkdir build
+    cd build
+    cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
+    cmake --build . --config Release
+    cmake --install . --prefix C:/LlamaCPP
+    ```

-  Running:
+  ##### Running Llama with CLBlast

   The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.

codecov.yml

@@ -0,0 +1,14 @@
+comment: off
+
+coverage:
+  status:
+    project:
+      default:
+        target: auto
+        threshold: 0
+        base: auto
+    patch:
+      default:
+        target: auto
+        threshold: 0
+        base: auto

common/common.cpp
@@ -305,6 +305,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_keep = std::stoi(argv[i]);
+        } else if (arg == "--draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_draft = std::stoi(argv[i]);
         } else if (arg == "--chunks") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -317,6 +323,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "-md" || arg == "--model-draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_draft = argv[i];
         } else if (arg == "-a" || arg == "--alias") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -572,106 +584,109 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }

 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stdout, "usage: %s [options]\n", argv[0]);
-    fprintf(stdout, "\n");
-    fprintf(stdout, "options:\n");
-    fprintf(stdout, "  -h, --help            show this help message and exit\n");
-    fprintf(stdout, "  -i, --interactive     run in interactive mode\n");
-    fprintf(stdout, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(stdout, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    fprintf(stdout, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(stdout, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stdout, "                        halt generation at PROMPT, return control in interactive mode\n");
-    fprintf(stdout, "                        (can be specified more than once for multiple prompts).\n");
-    fprintf(stdout, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stdout, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stdout, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(stdout, "                        prompt to start generation with (default: empty)\n");
-    fprintf(stdout, "  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stdout, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
-    fprintf(stdout, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
-    fprintf(stdout, "                        not supported with --interactive or other interactive options\n");
-    fprintf(stdout, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
-    fprintf(stdout, "  --random-prompt       start with a randomized prompt.\n");
-    fprintf(stdout, "  --in-prefix-bos       prefix BOS to user inputs, preceding the `--in-prefix` string\n");
-    fprintf(stdout, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(stdout, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
-    fprintf(stdout, "  -f FNAME, --file FNAME\n");
-    fprintf(stdout, "                        prompt file to start generation.\n");
-    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    fprintf(stdout, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(stdout, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stdout, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stdout, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(stdout, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    fprintf(stdout, "  --mirostat N          use Mirostat sampling.\n");
-    fprintf(stdout, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    fprintf(stdout, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(stdout, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(stdout, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    fprintf(stdout, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    fprintf(stdout, "                        modifies the likelihood of token appearing in the completion,\n");
-    fprintf(stdout, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stdout, "  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
-    fprintf(stdout, "  --grammar-file FNAME  file to read grammar from\n");
-    fprintf(stdout, "  --cfg-negative-prompt PROMPT\n");
-    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
-    fprintf(stdout, "  --cfg-negative-prompt-file FNAME\n");
-    fprintf(stdout, "                        negative prompt file to use for guidance. (default: empty)\n");
-    fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
-    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
-    fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
-    fprintf(stdout, "  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
-    fprintf(stdout, "  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
-    fprintf(stdout, "  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    fprintf(stdout, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+    printf("usage: %s [options]\n", argv[0]);
+    printf("\n");
+    printf("options:\n");
+    printf("  -h, --help            show this help message and exit\n");
+    printf("  -i, --interactive     run in interactive mode\n");
+    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
+    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
+    printf("  -r PROMPT, --reverse-prompt PROMPT\n");
+    printf("                        halt generation at PROMPT, return control in interactive mode\n");
+    printf("                        (can be specified more than once for multiple prompts).\n");
+    printf("  --color               colorise output to distinguish prompt and user input from generations\n");
+    printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
+    printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -p PROMPT, --prompt PROMPT\n");
+    printf("                        prompt to start generation with (default: empty)\n");
+    printf("  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    printf("  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
+    printf("  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
+    printf("                        not supported with --interactive or other interactive options\n");
+    printf("  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
+    printf("  --random-prompt       start with a randomized prompt.\n");
+    printf("  --in-prefix-bos       prefix BOS to user inputs, preceding the `--in-prefix` string\n");
+    printf("  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
+    printf("  -f FNAME, --file FNAME\n");
+    printf("                        prompt file to start generation.\n");
+    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
+    printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    printf("  --mirostat N          use Mirostat sampling.\n");
+    printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    printf("                        modifies the likelihood of token appearing in the completion,\n");
+    printf("                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+    printf("                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    printf("  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
+    printf("  --grammar-file FNAME  file to read grammar from\n");
+    printf("  --cfg-negative-prompt PROMPT\n");
+    printf("                        negative prompt to use for guidance. (default: empty)\n");
+    printf("  --cfg-negative-prompt-file FNAME\n");
+    printf("                        negative prompt file to use for guidance. (default: empty)\n");
+    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+    printf("  --no-penalize-nl      do not penalize newline token\n");
+    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    printf("  --perplexity          compute perplexity over each ctx window of the prompt\n");
+    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
+    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
+    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
-        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported()) {
-        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
-    fprintf(stdout, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
-    fprintf(stdout, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+    printf("  --numa                attempt optimizations that help on some NUMA systems\n");
+    printf("                        if run without this previously, it is recommended to drop the system page cache before using this\n");
+    printf("                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stdout, "                        number of layers to store in VRAM\n");
-    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    printf("  -ngl N, --n-gpu-layers N\n");
+    printf("                        number of layers to store in VRAM\n");
+    printf("  -ts SPLIT --tensor-split SPLIT\n");
+    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    printf("  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
 #ifdef GGML_USE_CUBLAS
-    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
-    fprintf(stdout, "                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
-    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
+    printf("  -nommq, --no-mul-mat-q\n");
+    printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-    fprintf(stdout, "  --mtest               compute maximum memory usage\n");
-    fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
-    fprintf(stdout, "  --verbose-prompt      print prompt before generation\n");
+    printf("  --mtest               compute maximum memory usage\n");
+    printf("  --export              export the computation graph to 'llama.ggml'\n");
+    printf("  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
-    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stdout, "  -m FNAME, --model FNAME\n");
-    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stdout, "  -ld LOGDIR, --logdir LOGDIR\n");
-    fprintf(stdout, "                        path under which to save YAML logs (no logging if unset)\n");
-    fprintf(stdout, "\n");
+    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf("  -m FNAME, --model FNAME\n");
+    printf("                        model path (default: %s)\n", params.model.c_str());
+    printf("  -md FNAME, --model-draft FNAME\n");
+    printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
+    printf("  -ld LOGDIR, --logdir LOGDIR\n");
+    printf("                        path under which to save YAML logs (no logging if unset)\n");
+    printf("\n");
 }
std::string gpt_random_prompt(std::mt19937 & rng) { std::string gpt_random_prompt(std::mt19937 & rng) {
@@ -702,7 +717,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params)
     lparams.n_ctx        = params.n_ctx;
     lparams.n_batch      = params.n_batch;
-    lparams.n_gpu_layers = params.n_gpu_layers;
+    if (params.n_gpu_layers != -1) {
+        lparams.n_gpu_layers = params.n_gpu_layers;
+    }
     lparams.main_gpu     = params.main_gpu;
     lparams.tensor_split = params.tensor_split;
     lparams.low_vram     = params.low_vram;
@@ -752,6 +769,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params)
         params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
     }

+    {
+        LOG("warming up the model with an empty run\n");
+
+        const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
+        llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_reset_timings(lctx);
+    }
+
     return std::make_tuple(model, lctx);
 }
@@ -824,6 +849,130 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens)
     return result;
 }

+//
+// Sampling utils
+//
+
+llama_token llama_sample_token(
+                  struct llama_context * ctx,
+                  struct llama_context * ctx_guidance,
+                  struct llama_grammar * grammar,
+               const struct gpt_params & params,
+        const std::vector<llama_token> & last_tokens,
+         std::vector<llama_token_data> & candidates,
+                                    int   idx) {
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_vocab = llama_n_vocab(ctx);
+
+    const float   temp            = params.temp;
+    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
+    const float   top_p           = params.top_p;
+    const float   tfs_z           = params.tfs_z;
+    const float   typical_p       = params.typical_p;
+    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+    const float   repeat_penalty  = params.repeat_penalty;
+    const float   alpha_presence  = params.presence_penalty;
+    const float   alpha_frequency = params.frequency_penalty;
+    const int     mirostat        = params.mirostat;
+    const float   mirostat_tau    = params.mirostat_tau;
+    const float   mirostat_eta    = params.mirostat_eta;
+    const bool    penalize_nl     = params.penalize_nl;
+
+    llama_token id = 0;
+
+    float * logits = llama_get_logits(ctx) + idx * n_vocab;
+
+    // Apply params.logit_bias map
+    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        logits[it->first] += it->second;
+    }
+
+    candidates.clear();
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+
+    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+
+    if (ctx_guidance) {
+        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
+    }
+
+    // apply penalties
+    if (!last_tokens.empty()) {
+        const float nl_logit = logits[llama_token_nl(ctx)];
+        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
+
+        llama_sample_repetition_penalty(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, repeat_penalty);
+        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, alpha_frequency, alpha_presence);
+
+        if (!penalize_nl) {
+            for (size_t idx = 0; idx < cur_p.size; idx++) {
+                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
+                    cur_p.data[idx].logit = nl_logit;
+                    break;
+                }
+            }
+        }
+    }
+
+    if (grammar != NULL) {
+        llama_sample_grammar(ctx, &cur_p, grammar);
+    }
+
+    if (temp <= 0) {
+        // Greedy sampling
+        id = llama_sample_token_greedy(ctx, &cur_p);
+    } else {
+        if (mirostat == 1) {
+            static float mirostat_mu = 2.0f * mirostat_tau;
+            const int mirostat_m = 100;
+            llama_sample_temperature(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+        } else if (mirostat == 2) {
+            static float mirostat_mu = 2.0f * mirostat_tau;
+            llama_sample_temperature(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+        } else {
+            // Temperature sampling
+            llama_sample_top_k      (ctx, &cur_p, top_k, 1);
+            llama_sample_tail_free  (ctx, &cur_p, tfs_z, 1);
+            llama_sample_typical    (ctx, &cur_p, typical_p, 1);
+            llama_sample_top_p      (ctx, &cur_p, top_p, 1);
+            llama_sample_temperature(ctx, &cur_p, temp);
+
+            {
+                const int n_top = 10;
+                LOG("top %d candidates:\n", n_top);
+
+                for (int i = 0; i < n_top; i++) {
+                    const llama_token id = cur_p.data[i].id;
+                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
+                }
+            }
+
+            id = llama_sample_token(ctx, &cur_p);
+
+            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
+        }
+    }
+    // printf("`%d`", candidates_p.size);
+
+    if (grammar != NULL) {
+        llama_grammar_accept_token(ctx, grammar, id);
+    }
+
+    return id;
+}
+
+//
+// YAML utils
+//
+
 // returns true if successful, false otherwise
 bool create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -1062,9 +1211,10 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
+    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
+    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");

common/common.h
@@ -32,8 +32,9 @@ struct gpt_params {
     int32_t n_ctx                           = 512;   // context size
     int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                          = 0;     // number of tokens to keep from initial prompt
+    int32_t n_draft                         = 16;    // number of tokens to draft during speculative decoding
     int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
-    int32_t n_gpu_layers                    = 0;     // number of layers to store in VRAM
+    int32_t n_gpu_layers                    = -1;    // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
     int32_t n_probs                         = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
@@ -63,6 +64,7 @@ struct gpt_params {
     float   cfg_scale     = 1.f; // How strong is guidance

     std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model_draft       = "";                              // draft model for speculative decoding
     std::string model_alias       = "unknown"; // model alias
     std::string prompt            = "";
     std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
@@ -156,6 +158,40 @@ std::string llama_detokenize_bpe(
         llama_context * ctx,
         const std::vector<llama_token> & tokens);

+//
+// Sampling utils
+//
+
+// this is a common sampling function used across the examples for convenience
+// it can serve as a starting point for implementing your own sampling function
+//
+// required:
+//  - ctx:    context to use for sampling
+//  - params: sampling parameters
+//
+// optional:
+//  - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
+//  - grammar:      grammar to use for sampling, ignore if NULL
+//  - last_tokens:  needed for repetition penalty, ignore if empty
+//  - idx:          sample from llama_get_logits(ctx) + idx * n_vocab
+//
+// returns:
+//  - token:      sampled token
+//  - candidates: vector of candidate tokens
+//
+llama_token llama_sample_token(
+                  struct llama_context * ctx,
+                  struct llama_context * ctx_guidance,
+                  struct llama_grammar * grammar,
+               const struct gpt_params & params,
+        const std::vector<llama_token> & last_tokens,
+         std::vector<llama_token_data> & candidates,
+                                    int   idx = 0);
+
+//
+// YAML utils
+//
+
 bool create_directory_with_parents(const std::string & path);
 void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
 void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
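Editor's note: as a reference for callers, here is a hedged sketch of the intended call pattern for the new helper, assembled from the declaration above and its definition in common.cpp (assumes `ctx` and `params` are already set up; no guidance context and no grammar):

```cpp
// Sketch of one sampling step inside a generation loop.
std::vector<llama_token>      last_tokens(llama_n_ctx(ctx), 0); // window of recent tokens for repetition penalty
std::vector<llama_token_data> candidates;                       // scratch buffer reused across calls
candidates.reserve(llama_n_vocab(ctx));

// samples from llama_get_logits(ctx), since idx defaults to 0
const llama_token id = llama_sample_token(ctx, nullptr, nullptr, params, last_tokens, candidates);

// keep the repetition-penalty window up to date
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
```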

common/log.h
@@ -341,14 +341,14 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriState::Same, ...)
         }
     }

-    if (_initialized)
-    {
-        if (_disabled)
-        {
-            // Log is disabled
-            return nullptr;
-        }
+    if (_disabled)
+    {
+        // Log is disabled
+        return nullptr;
+    }
+
+    if (_initialized)
+    {
         // with fallback in case something went wrong
         return logfile ? logfile : stderr;
     }
@@ -513,16 +513,16 @@ inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string &

 inline void log_print_usage()
 {
-    fprintf(stdout, "log options:\n");
+    printf("log options:\n");
     /* format
-    fprintf(stdout, "  -h, --help               show this help message and exit\n");*/
+    printf("  -h, --help               show this help message and exit\n");*/
     /* spacing
-    fprintf(stdout, "__-param----------------Description\n");*/
+    printf("__-param----------------Description\n");*/
-    fprintf(stdout, "  --log-test               Run simple logging test\n");
-    fprintf(stdout, "  --log-disable            Disable trace logs\n");
-    fprintf(stdout, "  --log-enable             Enable trace logs\n");
-    fprintf(stdout, "  --log-file               Specify a log filename (without extension)\n");
-    fprintf(stdout, "                           Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /*  */
+    printf("  --log-test               Run simple logging test\n");
+    printf("  --log-disable            Disable trace logs\n");
+    printf("  --log-enable             Enable trace logs\n");
+    printf("  --log-file               Specify a log filename (without extension)\n");
+    printf("                           Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /*  */
 }

 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)

View file

@@ -58,7 +58,7 @@ def parse_args() -> argparse.Namespace:
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
-   parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
+   parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
    return parser.parse_args()

args = parse_args()

View file

@@ -5,6 +5,7 @@ import argparse
import math
import struct
import sys
+from enum import IntEnum
from pathlib import Path

import numpy as np
@@ -34,10 +35,35 @@ GGML_QUANT_SIZES = {
    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}

+class GGMLFormat(IntEnum):
+    GGML = 0
+    GGMF = 1
+    GGJT = 2
+
+class GGMLFType(IntEnum):
+    ALL_F32              = 0
+    MOSTLY_F16           = 1
+    MOSTLY_Q4_0          = 2
+    MOSTLY_Q4_1          = 3
+    MOSTLY_Q4_1_SOME_F16 = 4
+    MOSTLY_Q8_0          = 7
+    MOSTLY_Q5_0          = 8
+    MOSTLY_Q5_1          = 9
+    MOSTLY_Q2_K          = 10
+    MOSTLY_Q3_K_S        = 11
+    MOSTLY_Q3_K_M        = 12
+    MOSTLY_Q3_K_L        = 13
+    MOSTLY_Q4_K_S        = 14
+    MOSTLY_Q4_K_M        = 15
+    MOSTLY_Q5_K_S        = 16
+    MOSTLY_Q5_K_M        = 17
+    MOSTLY_Q6_K          = 18
+
class Hyperparameters:
    def __init__(self):
-       self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
-       self.n_ff = 0
+       self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
+       self.n_layer = self.n_rot = self.n_ff = 0
+       self.ftype = GGMLFType.ALL_F32

    def set_n_ff(self, model):
        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
@@ -53,16 +79,21 @@ class Hyperparameters:
            self.n_head,
            self.n_layer,
            self.n_rot,
-           self.ftype,
+           ftype,
        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+       try:
+           self.ftype = GGMLFType(ftype)
+       except ValueError:
+           raise ValueError(f'Invalid ftype {ftype}')
        return 4 * 7

    def __str__(self):
-       return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+       return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'

class Vocab:
-   def __init__(self):
+   def __init__(self, load_scores = True):
        self.items = []
+       self.load_scores = load_scores

    def load(self, data, offset, n_vocab):
        orig_offset = offset
@@ -70,20 +101,24 @@ class Vocab:
            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
            assert itemlen < 4096, 'Absurd vocab item length'
            offset += 4
-           vocab = bytes(data[offset:offset + itemlen])
+           item_text = bytes(data[offset:offset + itemlen])
            offset += itemlen
-           score = struct.unpack('<f', data[offset:offset + 4])[0]
-           offset += 4
-           self.items.append((vocab, score))
+           if self.load_scores:
+               item_score = struct.unpack('<f', data[offset:offset + 4])[0]
+               offset += 4
+           else:
+               item_score = 0.0
+           self.items.append((item_text, item_score))
        return offset - orig_offset

class Tensor:
-   def __init__(self):
+   def __init__(self, use_padding = True):
        self.name = None
        self.dims: tuple[int, ...] = ()
        self.dtype = None
        self.start_offset = 0
        self.len_bytes = np.int64(0)
+       self.use_padding = use_padding

    def load(self, data, offset):
        orig_offset = offset
@@ -99,7 +134,7 @@ class Tensor:
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
        offset += name_len
-       pad = ((offset + 31) & ~31) - offset
+       pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
        offset += pad
        n_elems = np.prod(self.dims)
        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
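The pad expression above rounds the read offset up to the next 32-byte boundary (use_padding now disables it for pre-GGJT files, which stored tensors unaligned). The same arithmetic as a standalone C++ check; the helper name is mine:

    #include <cstdint>
    #include <cstdio>

    // ((offset + 31) & ~31) - offset, as in Tensor.load above;
    // the bit trick works because 32 is a power of two.
    static uint64_t pad32(uint64_t offset) {
        return ((offset + 31) & ~(uint64_t) 31) - offset;
    }

    int main() {
        printf("%llu\n", (unsigned long long) pad32(100)); // 28: next boundary is 128
        printf("%llu\n", (unsigned long long) pad32(128)); // 0: already aligned
        return 0;
    }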
@@ -109,7 +144,7 @@ class Tensor:
        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
        return offset - orig_offset

-class GGMLV3Model:
+class GGMLModel:
    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
@@ -117,20 +152,52 @@ class GGMLV3Model:
        self.tensors = []

    def validate_header(self, data, offset):
-       if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
-           raise ValueError('Only GGJTv3 supported')
-       return 8
+       magic = bytes(data[offset:offset + 4])
+       if magic == b'GGUF':
+           raise ValueError('File is already in GGUF format.')
+       if magic == b'lmgg':
+           self.file_format = GGMLFormat.GGML
+           self.format_version = 1
+           return 4
+       version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
+       if magic == b'fmgg':
+           if version != 1:
+               raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
+           self.file_format = GGMLFormat.GGMF
+           self.format_version = version
+           return 8
+       if magic == b'tjgg':
+           if version < 1 or version > 3:
+               raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
+           self.file_format = GGMLFormat.GGJT
+           self.format_version = version
+           return 8
+       raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
+
+   def validate_conversion(self, ftype):
+       err = ''
+       if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
+           if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
+               err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
+       elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
+           if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+                         GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+               err = 'Q4 and Q8 quantizations changed in GGJTv3.'
+       if len(err) > 0:
+           raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')

    def load(self, data, offset):
        offset += self.validate_header(data, offset)
        hp = Hyperparameters()
        offset += hp.load(data, offset)
-       vocab = Vocab()
+       print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
+       self.validate_conversion(hp.ftype)
+       vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
        offset += vocab.load(data, offset, hp.n_vocab)
        tensors: list[Tensor] = []
        tensor_map = {}
        while offset < len(data):
-           tensor = Tensor()
+           tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
            offset += tensor.load(data, offset)
            tensor_map[tensor.name] = len(tensors)
            tensors.append(tensor)
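To restate the two new checks compactly: validate_header recognizes four magics, and validate_conversion gates which (format, version, ftype) combinations can still be converted losslessly. A C++ paraphrase, with invented names (sniff_format, convertible) and the raw ftype codes from the GGMLFType enum above:

    #include <cstdint>
    #include <cstring>

    enum class Fmt { GGML, GGMF, GGJT, GGUF, UNKNOWN };

    // Mirror of validate_header: the legacy magics are 'ggml'/'ggmf'/'ggjt'
    // stored little-endian, so the raw bytes read back as "lmgg"/"fmgg"/"tjgg".
    static Fmt sniff_format(const uint8_t * data) {
        if (memcmp(data, "GGUF", 4) == 0) { return Fmt::GGUF; }
        if (memcmp(data, "lmgg", 4) == 0) { return Fmt::GGML; } // unversioned, no version field
        if (memcmp(data, "fmgg", 4) == 0) { return Fmt::GGMF; } // 4-byte version follows
        if (memcmp(data, "tjgg", 4) == 0) { return Fmt::GGJT; } // 4-byte version follows
        return Fmt::UNKNOWN;
    }

    // Mirror of validate_conversion: 0/1 = F32/F16; 2/3/4/7 are the Q4/Q8
    // variants whose block layout changed again in GGJTv3.
    static bool convertible(Fmt fmt, uint32_t version, uint32_t ftype) {
        if (fmt != Fmt::GGJT || version < 2) {
            return ftype == 0 || ftype == 1; // only unquantized files predate GGJTv2 safely
        }
        if (version == 2) {
            return ftype != 2 && ftype != 3 && ftype != 4 && ftype != 7;
        }
        return true; // GGJTv3 converts directly
    }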
@@ -168,7 +235,10 @@ class GGMLToGGUF:
    def save(self):
        print('* Preparing to save GGUF file')
-       gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+       gguf_writer = gguf.GGUFWriter(
+           self.cfg.output,
+           gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
+           use_temp_file = False )
        self.add_params(gguf_writer)
        self.add_vocab(gguf_writer)
        if self.special_vocab is not None:
@@ -185,7 +255,10 @@ class GGMLToGGUF:
    def add_params(self, gguf_writer):
        hp = self.model.hyperparameters
        cfg = self.cfg
-       desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+       if cfg.desc is not None:
+           desc = cfg.desc
+       else:
+           desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
        try:
            # Filenames aren't necessarily valid UTF8.
            name = cfg.name if cfg.name is not None else cfg.input.name
@@ -195,6 +268,7 @@ class GGMLToGGUF:
        if name is not None:
            gguf_writer.add_name(name)
        gguf_writer.add_description(desc)
+       gguf_writer.add_file_type(int(hp.ftype))
        if self.params_override is not None:
            po = self.params_override
            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
@@ -231,7 +305,8 @@ class GGMLToGGUF:
            tokens.append(vbytes)
            scores.append(score)
            toktypes.append(ttype)
-       assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+       assert len(tokens) == hp.n_vocab, \
+           f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
        gguf_writer.add_token_list(tokens)
        gguf_writer.add_token_scores(scores)
        if len(toktypes) > 0:
@@ -283,7 +358,11 @@ class GGMLToGGUF:
                tempdims[1] = tempdims[0]
                tempdims[0] = temp
            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
-           gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+           gguf_writer.add_tensor(
+               mapped_name,
+               data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
+               raw_shape = tempdims,
+               raw_dtype = tensor.dtype )

def handle_metadata(cfg, hp):
    import convert
@@ -305,32 +384,46 @@ def handle_metadata(cfg, hp):
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
-   vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+   vocab = convert.load_vocab(
+       cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
+       cfg.vocabtype )
    # FIXME: Respect cfg.vocab_dir?
    svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
    return (params, vocab, svocab)

def handle_args():
-   parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
+   parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
-   parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename')
+   parser.add_argument('--input', '-i', type = Path, required = True,
+       help = 'Input GGMLv3 filename')
-   parser.add_argument('--output', '-o', type = Path, required = True, help ='Output GGUF filename')
+   parser.add_argument('--output', '-o', type = Path, required = True,
+       help ='Output GGUF filename')
-   parser.add_argument('--name', help = 'Set model name')
+   parser.add_argument('--name',
+       help = 'Set model name')
-   parser.add_argument('--desc', help = 'Set model description')
+   parser.add_argument('--desc',
+       help = 'Set model description')
-   parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+   parser.add_argument('--gqa', type = int, default = 1,
+       help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
-   parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+   parser.add_argument('--eps', default = '5.0e-06',
+       help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
-   parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+   parser.add_argument('--context-length', '-c', type=int, default = 2048,
+       help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
-   parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+   parser.add_argument('--model-metadata-dir', '-m', type = Path,
+       help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
-   parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+   parser.add_argument("--vocab-dir", type=Path,
+       help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-   parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+   parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
+       help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
    return parser.parse_args()

def main():
    cfg = handle_args()
    print(f'* Using config: {cfg}')
    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+   if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
+       print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
    data = np.memmap(cfg.input, mode = 'r')
-   model = GGMLV3Model()
+   model = GGMLModel()
    print('* Scanning GGML input file')
    offset = model.load(data, 0)
    print(f'* GGML model hyperparameters: {model.hyperparameters}')
@@ -345,7 +438,12 @@ def main():
        print(f'* Special vocab: {special_vocab}')
    else:
        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
-   converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
+   if model.file_format == GGMLFormat.GGML:
+       print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
+   converter = GGMLToGGUF(model, data, cfg,
+       params_override = params_override,
+       vocab_override = vocab_override,
+       special_vocab = special_vocab )
    converter.save()
    print(f'* Successful completion. Output saved to: {cfg.output}')

View file

@@ -323,15 +323,27 @@ class BpeVocab:
        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
+           # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
+           # Fall back to trying to find the added tokens in tokenizer.json
+           tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
+           if not tokenizer_json_file.is_file():
                added_tokens = {}
+           else:
+               tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
+               added_tokens = dict(
+                   (item['content'], item['id'])
+                   for item in tokenizer_json.get('added_tokens', [])
+                   # Added tokens here can be duplicates of the main vocabulary.
+                   if item['content'] not in self.bpe_tokenizer )

        vocab_size: int = len(self.bpe_tokenizer)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids = sorted(added_tokens.values())
        if expected_ids != actual_ids:
-           raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+           expected_end_id = vocab_size + len(actual_ids) - 1
+           raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_list = [text for (text, idx) in items]
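The reworked exception message reflects the underlying invariant: with a base vocabulary of vocab_size entries, added tokens must occupy IDs vocab_size through vocab_size + n_added - 1 (e.g. 32000-32002 for three added tokens on a 32000-entry vocab). A small C++ paraphrase of the check; the helper name is mine:

    #include <algorithm>
    #include <vector>

    // Added-token IDs must form the contiguous range
    // [vocab_size, vocab_size + n_added - 1] right after the base vocab.
    static bool added_ids_sequential(size_t vocab_size, std::vector<int> ids) {
        std::sort(ids.begin(), ids.end());
        for (size_t i = 0; i < ids.size(); ++i) {
            if (ids[i] != (int) (vocab_size + i)) {
                return false;
            }
        }
        return true;
    }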
@@ -345,10 +357,22 @@ class BpeVocab:
        from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
        byte_encoder = tokenization_gpt2.bytes_to_unicode()
        byte_decoder = {v: k for k, v in byte_encoder.items()}
+       score = 0.0
        for i, item in enumerate(tokenizer):
            text: bytes = item.encode("utf-8")
-           score: float = -i
-           yield text, score, gguf.TokenType.USER_DEFINED
+           # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
+           if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
+               if i == 0 and text == b'<unk>':
+                   toktype = gguf.TokenType.UNKNOWN
+               elif i == 1 or i == 2:
+                   toktype = gguf.TokenType.CONTROL
+               elif i >= 3 and text.startswith(b'<0x'):
+                   toktype = gguf.TokenType.BYTE
+               else:
+                   toktype = gguf.TokenType.NORMAL
+           else:
+               toktype = gguf.TokenType.NORMAL
+           yield text, score, toktype

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
@@ -649,7 +673,7 @@ class LazyUnpickler(pickle.Unpickler):
        assert isinstance(pid[1], LazyStorageKind)
        data_type = pid[1].data_type
        filename_stem = pid[2]
-       filename = self.data_base_path + '/' + filename_stem
+       filename = f'{self.data_base_path}/{filename_stem}'
        info = self.zip_file.getinfo(filename)

        def load(offset: int, elm_count: int) -> NDArray:
@@ -665,7 +689,6 @@ class LazyUnpickler(pickle.Unpickler):
    @staticmethod
    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
-                              # pyright: ignore[reportSelfClsParameterName]
                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
        assert isinstance(storage, LazyStorage)

View file

@@ -23,6 +23,7 @@ else()
    add_subdirectory(train-text-from-scratch)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
+   add_subdirectory(speculative)
    add_subdirectory(embd-input)
    add_subdirectory(llama-bench)
    add_subdirectory(beam-search)

View file

@@ -76,7 +76,7 @@ bool gguf_ex_write(const std::string & fname) {
    gguf_write_to_file(ctx, fname.c_str(), false);

-   fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
+   printf("%s: wrote file '%s;\n", __func__, fname.c_str());

    ggml_free(ctx_data);
    gguf_free(ctx);
@@ -93,20 +93,20 @@ bool gguf_ex_read_0(const std::string & fname) {
    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

-   fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
+   printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
-   fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+   printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
-   fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+   printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));

    // kv
    {
        const int n_kv = gguf_get_n_kv(ctx);

-       fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+       printf("%s: n_kv: %d\n", __func__, n_kv);

        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ctx, i);

-           fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+           printf("%s: kv[%d]: key = %s\n", __func__, i, key);
        }
    }
@@ -116,10 +116,10 @@ bool gguf_ex_read_0(const std::string & fname) {
        const int keyidx = gguf_find_key(ctx, findkey);
        if (keyidx == -1) {
-           fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
+           printf("%s: find key: %s not found.\n", __func__, findkey);
        } else {
            const char * key_value = gguf_get_val_str(ctx, keyidx);
-           fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+           printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
        }
    }
@@ -127,13 +127,13 @@ bool gguf_ex_read_0(const std::string & fname) {
    {
        const int n_tensors = gguf_get_n_tensors(ctx);

-       fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+       printf("%s: n_tensors: %d\n", __func__, n_tensors);

        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name (ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);

-           fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+           printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
        }
    }
@@ -153,20 +153,20 @@ bool gguf_ex_read_1(const std::string & fname) {
    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

-   fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
+   printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
-   fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+   printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
-   fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+   printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));

    // kv
    {
        const int n_kv = gguf_get_n_kv(ctx);

-       fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+       printf("%s: n_kv: %d\n", __func__, n_kv);

        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ctx, i);

-           fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+           printf("%s: kv[%d]: key = %s\n", __func__, i, key);
        }
    }
@@ -174,13 +174,13 @@ bool gguf_ex_read_1(const std::string & fname) {
    {
        const int n_tensors = gguf_get_n_tensors(ctx);

-       fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+       printf("%s: n_tensors: %d\n", __func__, n_tensors);

        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name (ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);

-           fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+           printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
        }
    }
@@ -189,13 +189,13 @@ bool gguf_ex_read_1(const std::string & fname) {
        const int n_tensors = gguf_get_n_tensors(ctx);

        for (int i = 0; i < n_tensors; ++i) {
-           fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
+           printf("%s: reading tensor %d data\n", __func__, i);

            const char * name = gguf_get_tensor_name(ctx, i);
            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

-           fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+           printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);

            // print first 10 elements
            const float * data = (const float *) cur->data;
@@ -219,7 +219,7 @@ bool gguf_ex_read_1(const std::string & fname) {
        }
    }

-   fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+   printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));

    ggml_free(ctx_data);
    gguf_free(ctx);
@@ -229,7 +229,7 @@ bool gguf_ex_read_1(const std::string & fname) {
int main(int argc, char ** argv) {
    if (argc < 3) {
-       fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
+       printf("usage: %s data.gguf r|w\n", argv[0]);
        return -1;
    }

View file

@@ -305,9 +305,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
    if( cur == NULL ) {
-       fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
+       printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
    } else {
-//     fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
+//     printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
    }

    return cur;
@@ -333,21 +333,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
        return false;
    }

-   fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
+   printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
-   fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
+   printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
-   fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
+   printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));

    // print all kv
    #if 0
    {
        const int n_kv = gguf_get_n_kv(ggufctx);

-       fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+       printf("%s: n_kv: %d\n", __func__, n_kv);

        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ggufctx, i);

-           fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+           printf("%s: kv[%d]: key = %s\n", __func__, i, key);
        }
    }
    #endif
@@ -357,21 +357,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
        int keyidx;

        keyidx = gguf_find_key(ggufctx, "general.name");
-       if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.description");
-       if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.author");
-       if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.license");
-       if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.architecture");
-       if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.file_type");
-       if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
-       if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
-       if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    }
@@ -382,11 +382,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
    keyidx = gguf_find_key(ggufctx, "general.architecture");
    if (keyidx != -1) {
        if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) {
-           fprintf(stdout, "%s: model architecture not supported!\n", __func__);
+           printf("%s: model architecture not supported!\n", __func__);
            return false;
        }
    } else {
-       fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
+       printf("%s: gguf model architecture not found!\n", __func__);
        return false;
    }
@@ -394,11 +394,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
    keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout");
    if (keyidx != -1) {
        if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) {
-           fprintf(stdout, "%s: model tensor data layout not supported!\n", __func__);
+           printf("%s: model tensor data layout not supported!\n", __func__);
            return false;
        }
    } else {
-       fprintf(stdout, "%s: gguf model tensor data layout not found!\n", __func__);
+       printf("%s: gguf model tensor data layout not found!\n", __func__);
        return false;
    }
@@ -455,11 +455,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
    if (keyidx != -1) {
        if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
-           fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
+           printf("%s: tokenizer model not supported!\n", __func__);
            return false;
        }
    } else {
-       fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
+       printf("%s: tokenizer model not found!\n", __func__);
        return false;
    }
@@ -467,22 +467,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
    int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
    if (tokens_keyidx == -1) {
-       fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
+       printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
        return false;
    }

    int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
    if (merges_keyidx == -1) {
-       fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
+       printf("%s: gpt2 tokenizer merges not found!\n", __func__);
        return false;
    }

    hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
    hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);

-   fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
+   printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
-   fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
+   printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);

    for (size_t i = 0; i < hparams.n_vocab; i++) {
        std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@@ -523,12 +523,12 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
    keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
    keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }

-   if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
+   if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
-   if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
+   if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
-   if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
+   if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
-   if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
+   if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
-   if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
+   if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
-   if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
+   if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
    }
@@ -543,13 +543,13 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
    {
        const int n_tensors = gguf_get_n_tensors(ggufctx);

-       fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+       printf("%s: n_tensors: %d\n", __func__, n_tensors);

        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name (ggufctx, i);
            const size_t offset = gguf_get_tensor_offset(ggufctx, i);

-           fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+           printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
        }
    }
    #endif

View file

@@ -318,9 +318,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
    if( cur == NULL ) {
-       fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
+       printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
    } else {
-//     fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
+//     printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
    }

    return cur;
@@ -346,21 +346,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
        return false;
    }

-   fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
+   printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
-   fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
+   printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
-   fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
+   printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));

    // print all kv
    #if 0
    {
        const int n_kv = gguf_get_n_kv(ggufctx);

-       fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+       printf("%s: n_kv: %d\n", __func__, n_kv);

        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ggufctx, i);

-           fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+           printf("%s: kv[%d]: key = %s\n", __func__, i, key);
        }
    }
    #endif
@@ -370,21 +370,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
        int keyidx;

        keyidx = gguf_find_key(ggufctx, "general.name");
-       if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.description");
-       if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.author");
-       if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.license");
-       if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.architecture");
-       if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.file_type");
-       if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
-       if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
-       if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+       if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    }
@@ -395,11 +395,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
    keyidx = gguf_find_key(ggufctx, "general.architecture");
    if (keyidx != -1) {
        if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) {
-           fprintf(stdout, "%s: model architecture not supported!\n", __func__);
+           printf("%s: model architecture not supported!\n", __func__);
            return false;
        }
    } else {
-       fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
+       printf("%s: gguf model architecture not found!\n", __func__);
        return false;
    }
@@ -456,11 +456,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
    if (keyidx != -1) {
        if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
-           fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
+           printf("%s: tokenizer model not supported!\n", __func__);
            return false;
        }
    } else {
-       fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
+       printf("%s: tokenizer model not found!\n", __func__);
        return false;
    }
@@ -468,22 +468,22 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
    int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
    if (tokens_keyidx == -1) {
-       fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
+       printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
        return false;
    }

    int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
    if (merges_keyidx == -1) {
-       fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
+       printf("%s: gpt2 tokenizer merges not found!\n", __func__);
        return false;
    }

    hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
    hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);

-   fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
+   printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
-   fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
+   printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);

    for (size_t i = 0; i < hparams.n_vocab; i++) {
        std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@@ -524,12 +524,12 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
    keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
    keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }

-   if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
+   if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
-   if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
+   if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
-   if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
+   if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
-   if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
+   if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
-   if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
+   if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
-   if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
+   if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
    }
@@ -543,13 +543,13 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
    {
        const int n_tensors = gguf_get_n_tensors(ggufctx);

-       fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+       printf("%s: n_tensors: %d\n", __func__, n_tensors);

        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name (ggufctx, i);
            const size_t offset = gguf_get_tensor_offset(ggufctx, i);

-           fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+           printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
        }
    }
    #endif
@@ -660,9 +660,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
ggml_tensor * gpt_neox_ff(
    const gpt_neox_block &block,
    ggml_context * ctx0,
-   ggml_tensor * inp) {
-   ggml_tensor * cur = ggml_norm(ctx0, inp);
+   ggml_tensor * inp,
+   const gpt_neox_hparams &hparams) {
+   ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps);

    cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));

    cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
@@ -753,7 +754,7 @@ bool gpt_neox_eval(
        // self-attention
        {
            {
-               cur = ggml_norm(ctx0, inpL);
+               cur = ggml_norm(ctx0, inpL, hparams.norm_eps);

                cur = ggml_add(ctx0,
                    ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
@@ -844,7 +845,7 @@ bool gpt_neox_eval(
        if (hparams.par_res == 0) {
            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);

-           cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
+           cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams);

            // input for next layer
            inpL = ggml_add(ctx0, cur, inpFF);
@@ -853,7 +854,7 @@ bool gpt_neox_eval(
            // this is independent of the self-attention result, so it could be done in parallel to the self-attention
            // note here we pass inpL instead of cur
-           cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
+           cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams);

            // layer input + FF
            cur = ggml_add(ctx0, cur, inpFF);
@@ -867,7 +868,7 @@ bool gpt_neox_eval(
    // norm
    {
-       inpL = ggml_norm(ctx0, inpL);
+       inpL = ggml_norm(ctx0, inpL, hparams.norm_eps);

        // inpL = ln_f_g*inpL + ln_f_b
        inpL = ggml_add(ctx0,
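ggml_norm now takes the epsilon explicitly instead of a compiled-in default, which is why gpt_neox_ff threads hparams.norm_eps through every call site above. A condensed sketch of the normalize-scale-shift pattern those hunks repeat; the helper name layer_norm is mine:

    #include "ggml.h"

    // Normalize, then apply the learned scale g and shift b. `eps` guards the
    // variance term against division by zero; the value comes from the model's
    // hyperparameters rather than a hardcoded constant.
    static ggml_tensor * layer_norm(ggml_context * ctx, ggml_tensor * x,
                                    ggml_tensor * g, ggml_tensor * b, float eps) {
        ggml_tensor * cur = ggml_norm(ctx, x, eps);
        return ggml_add(ctx,
                        ggml_mul(ctx, ggml_repeat(ctx, g, cur), cur),
                        ggml_repeat(ctx, b, cur));
    }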

40
examples/llama-bench/llama-bench.cpp Executable file → Normal file
View file

@ -165,26 +165,26 @@ static const cmd_params cmd_params_defaults = {
}; };
static void print_usage(int /* argc */, char ** argv) { static void print_usage(int /* argc */, char ** argv) {
fprintf(stdout, "usage: %s [options]\n", argv[0]); printf("usage: %s [options]\n", argv[0]);
fprintf(stdout, "\n"); printf("\n");
fprintf(stdout, "options:\n"); printf("options:\n");
fprintf(stdout, " -h, --help\n"); printf(" -h, --help\n");
fprintf(stdout, " -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
fprintf(stdout, " -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
fprintf(stdout, " -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
fprintf(stdout, " -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
fprintf(stdout, " --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
fprintf(stdout, " -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
fprintf(stdout, " -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
fprintf(stdout, " -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str()); printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
fprintf(stdout, " -ts, --tensor_split <ts0/ts1/..> \n"); printf(" -ts, --tensor_split <ts0/ts1/..> \n");
fprintf(stdout, " -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql"); printf(" -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
fprintf(stdout, "\n"); printf("\n");
fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
} }
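The comma syntax documented in the help text above is worth illustrating. A minimal sketch of how such a list can be split into individual values (hypothetical helper; the actual llama-bench parser may differ):

#include <sstream>
#include <string>
#include <vector>

// split "128,256,512" into {128, 256, 512}
static std::vector<int> split_int(const std::string & s, char delim = ',') {
    std::vector<int> out;
    std::stringstream ss(s);
    std::string item;
    while (std::getline(ss, item, delim)) {
        out.push_back(std::stoi(item));
    }
    return out;
}

With this, `-b 128,256,512` and `-b 128 -b 256 -b 512` both end up as the same three-element sweep, as the help text states.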


@ -151,14 +151,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
} }
if (params.n_ctx > 2048) {
// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
@ -194,6 +186,13 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
if (params.n_ctx > llama_n_ctx(ctx)) {
LOG_TEE("%s: warning: base model only supports context sizes no greater than %d tokens (%d specified)\n", __func__, llama_n_ctx(ctx), params.n_ctx);
} else if (params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_TEE("\n");
@ -425,8 +424,9 @@ int main(int argc, char ** argv) {
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n"); LOG_TEE("\n\n");
struct llama_grammar * grammar = NULL;
grammar_parser::parse_state parsed_grammar; grammar_parser::parse_state parsed_grammar;
llama_grammar * grammar = NULL;
if (!params.grammar.empty()) { if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str()); parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors // will be empty (default) if there are parse errors
@ -450,8 +450,8 @@ int main(int argc, char ** argv) {
} }
// TODO: replace with ring-buffer // TODO: replace with ring-buffer
std::vector<llama_token> last_n_tokens(n_ctx); std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); std::fill(last_tokens.begin(), last_tokens.end(), 0);
if (params.interactive) { if (params.interactive) {
const char *control_message; const char *control_message;
@ -492,13 +492,10 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd; std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance; std::vector<llama_token> embd_guidance;
{ const int n_vocab = llama_n_vocab(ctx);
LOG("warming up the model with an empty run\n");
const std::vector<llama_token> tmp = { llama_token_bos(ctx), }; std::vector<llama_token_data> candidates;
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); candidates.reserve(n_vocab);
llama_reset_timings(ctx);
}
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict // predict
@ -537,8 +534,8 @@ int main(int argc, char ** argv) {
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
// insert n_left/2 tokens at the start of embd from last_n_tokens // insert n_left/2 tokens at the start of embd from last_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
@ -637,20 +634,6 @@ int main(int argc, char ** argv) {
embd_guidance.clear(); embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
const float repeat_penalty = params.repeat_penalty;
const float alpha_presence = params.presence_penalty;
const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
// optionally save the session on first sample (for faster prompt loading next time) // optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false; need_to_save_session = false;
@ -659,98 +642,12 @@ int main(int argc, char ** argv) {
LOG("saved session to %s\n", path_session.c_str()); LOG("saved session to %s\n", path_session.c_str());
} }
llama_token id = 0; const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
{ last_tokens.erase(last_tokens.begin());
auto logits = llama_get_logits(ctx); last_tokens.push_back(id);
auto n_vocab = llama_n_vocab(ctx);
// Apply params.logit_bias map LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
}
// Apply penalties
float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &cur_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
if (grammar != NULL) {
llama_sample_grammar(ctx, &cur_p, grammar);
}
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &cur_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k (ctx, &cur_p, top_k, 1);
llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
llama_sample_typical (ctx, &cur_p, typical_p, 1);
llama_sample_top_p (ctx, &cur_p, top_p, 1);
llama_sample_temperature(ctx, &cur_p, temp);
{
const int n_top = 10;
LOG("top %d candidates:\n", n_top);
for (int i = 0; i < n_top; i++) {
const llama_token id = cur_p.data[i].id;
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
}
}
id = llama_sample_token(ctx, &cur_p);
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
}
}
// printf("`%d`", candidates_p.size);
if (grammar != NULL) {
llama_grammar_accept_token(ctx, grammar, id);
}
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_n_tokens));
}
embd.push_back(id); embd.push_back(id);
@ -766,8 +663,8 @@ int main(int argc, char ** argv) {
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) { while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]); embd.push_back(embd_inp[n_consumed]);
last_n_tokens.erase(last_n_tokens.begin()); last_tokens.erase(last_tokens.begin());
last_n_tokens.push_back(embd_inp[n_consumed]); last_tokens.push_back(embd_inp[n_consumed]);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
break; break;
@ -800,7 +697,7 @@ int main(int argc, char ** argv) {
// check for reverse prompt // check for reverse prompt
if (params.antiprompt.size()) { if (params.antiprompt.size()) {
std::string last_output; std::string last_output;
for (auto id : last_n_tokens) { for (auto id : last_tokens) {
last_output += llama_token_to_piece(ctx, id); last_output += llama_token_to_piece(ctx, id);
} }
@ -831,7 +728,7 @@ int main(int argc, char ** argv) {
} }
// deal with end of text token in interactive mode // deal with end of text token in interactive mode
if (last_n_tokens.back() == llama_token_eos(ctx)) { if (last_tokens.back() == llama_token_eos(ctx)) {
LOG("found EOS token\n"); LOG("found EOS token\n");
if (params.interactive) { if (params.interactive) {
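The roughly 100 removed lines above inlined the full sampling pipeline (logit bias, repetition/frequency/presence penalties, grammar, mirostat, and the top-k/tail-free/typical/top-p/temperature chain) that is now consolidated behind llama_sample_token in common. A self-contained toy sketch of just the final temperature branch, not the llama.cpp API (assumes 0 < top_k <= vocab size and temp > 0):

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

static int sample_top_k_temp(const std::vector<float> & logits, int top_k,
                             float temp, std::mt19937 & rng) {
    std::vector<int> idx(logits.size());
    for (int i = 0; i < (int) idx.size(); ++i) idx[i] = i;
    // keep only the top_k highest logits
    std::partial_sort(idx.begin(), idx.begin() + top_k, idx.end(),
        [&](int a, int b) { return logits[a] > logits[b]; });
    idx.resize(top_k);
    // softmax over the survivors, with the temperature applied to the logits
    std::vector<float> p(top_k);
    float sum = 0.0f;
    for (int i = 0; i < top_k; ++i) {
        p[i] = std::exp((logits[idx[i]] - logits[idx[0]]) / temp);
        sum += p[i];
    }
    for (float & v : p) v /= sum;
    // draw one candidate proportionally to its probability
    std::discrete_distribution<int> dist(p.begin(), p.end());
    return idx[dist(rng)];
}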


@ -368,7 +368,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
// Example, we have a context window of 512, we will compute perplexity for each of the // Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to // last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt. // process the entire prompt.
const int first = std::min(512, params.n_ctx/2); const int first = params.n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += params.n_ctx - first - 1; count += params.n_ctx - first - 1;
@ -668,11 +668,6 @@ int main(int argc, char ** argv) {
params.n_ctx += params.ppl_stride/2; params.n_ctx += params.ppl_stride/2;
} }
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
@ -698,6 +693,11 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
if (params.n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than %d tokens (%d specified);"
"expect poor results\n", __func__, llama_n_ctx(ctx), params.n_ctx);
}
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
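The hunks above move the context-size warning after context creation and score each chunk from position `first` onward. A hedged sketch of the scoring arithmetic (probs is a hypothetical array holding the model's probability of each observed token):

#include <cmath>
#include <vector>

static double chunk_ppl(const std::vector<double> & probs, size_t first) {
    double nll = 0.0;
    size_t count = 0;
    for (size_t i = first; i + 1 < probs.size(); ++i) {
        nll += -std::log(probs[i + 1]);  // NLL of predicting token i+1 from its prefix
        ++count;                         // ends at n_ctx - first - 1, matching `count` above
    }
    return std::exp(nll / count);        // perplexity = exp(mean NLL)
}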

File diff suppressed because it is too large


@ -145,7 +145,29 @@
color: #888; color: #888;
} }
@keyframes loading-bg-wipe {
0% {
background-position: 0%;
}
100% {
background-position: 100%;
}
}
.loading {
--loading-color-1: #eeeeee00;
--loading-color-2: #eeeeeeff;
background-size: 50% 100%;
background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
animation: loading-bg-wipe 2s linear infinite;
}
@media (prefers-color-scheme: dark) { @media (prefers-color-scheme: dark) {
.loading {
--loading-color-1: #22222200;
--loading-color-2: #222222ff;
}
.popover-content { .popover-content {
background-color: black; background-color: black;
} }
@ -321,7 +343,10 @@
const llamaStats = signal(null) const llamaStats = signal(null)
const controller = signal(null) const controller = signal(null)
const generating = computed(() => controller.value == null ) // currently generating a completion?
const generating = computed(() => controller.value != null)
// has the user started a chat?
const chatStarted = computed(() => session.value.transcript.length > 0) const chatStarted = computed(() => session.value.transcript.length > 0)
const transcriptUpdate = (transcript) => { const transcriptUpdate = (transcript) => {
@ -430,11 +455,19 @@
return html` return html`
<form onsubmit=${submit}> <form onsubmit=${submit}>
<div> <div>
<textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/> <textarea
className=${generating.value ? "loading" : null}
oninput=${(e) => message.value = e.target.value}
onkeypress=${enterSubmits}
placeholder="Say something..."
rows=2
type="text"
value="${message}"
/>
</div> </div>
<div class="right"> <div class="right">
<button type="submit" disabled=${!generating.value} >Send</button> <button type="submit" disabled=${generating.value}>Send</button>
<button onclick=${stop} disabled=${generating}>Stop</button> <button onclick=${stop} disabled=${!generating.value}>Stop</button>
<button onclick=${reset}>Reset</button> <button onclick=${reset}>Reset</button>
</div> </div>
</form> </form>


@ -118,7 +118,7 @@ static void server_log(const char *level, const char *function, int line,
} }
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
fprintf(stdout, "%.*s\n", (int)str.size(), str.data()); printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout); fflush(stdout);
} }
@ -694,50 +694,50 @@ struct llama_server_context
static void server_print_usage(const char *argv0, const gpt_params &params, static void server_print_usage(const char *argv0, const gpt_params &params,
const server_params &sparams) const server_params &sparams)
{ {
fprintf(stdout, "usage: %s [options]\n", argv0); printf("usage: %s [options]\n", argv0);
fprintf(stdout, "\n"); printf("\n");
fprintf(stdout, "options:\n"); printf("options:\n");
fprintf(stdout, " -h, --help show this help message and exit\n"); printf(" -h, --help show this help message and exit\n");
fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
if (llama_mlock_supported()) if (llama_mlock_supported())
{ {
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
} }
if (llama_mmap_supported()) if (llama_mmap_supported())
{ {
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
} }
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n"); printf(" --numa attempt optimizations that help on some NUMA systems\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stdout, " -ngl N, --n-gpu-layers N\n"); printf(" -ngl N, --n-gpu-layers N\n");
fprintf(stdout, " number of layers to store in VRAM\n"); printf(" number of layers to store in VRAM\n");
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); printf(" -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
fprintf(stdout, " -nommq, --no-mul-mat-q\n"); printf(" -nommq, --no-mul-mat-q\n");
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif #endif
fprintf(stdout, " -m FNAME, --model FNAME\n"); printf(" -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); printf(" model path (default: %s)\n", params.model.c_str());
fprintf(stdout, " -a ALIAS, --alias ALIAS\n"); printf(" -a ALIAS, --alias ALIAS\n");
fprintf(stdout, " set an alias for the model, will be added as `model` field in completion response\n"); printf(" set an alias for the model, will be added as `model` field in completion response\n");
fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stdout, " --host ip address to listen (default: %s)\n", sparams.hostname.c_str()); printf(" --host ip address to listen (default: %s)\n", sparams.hostname.c_str());
fprintf(stdout, " --port PORT port to listen (default: %d)\n", sparams.port); printf(" --port PORT port to listen (default: %d)\n", sparams.port);
fprintf(stdout, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
fprintf(stdout, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
fprintf(stdout, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
fprintf(stdout, "\n"); printf("\n");
} }
static void server_params_parse(int argc, char **argv, server_params &sparams, static void server_params_parse(int argc, char **argv, server_params &sparams,
@ -1379,7 +1379,13 @@ int main(int argc, char **argv)
} }
} }
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs); auto probs = llama.generated_token_probs;
if (llama.params.n_probs > 0 && llama.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
}
const json data = format_final_response(llama, llama.generated_text, probs);
llama_print_timings(llama.ctx); llama_print_timings(llama.ctx);
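The trimming above relies on the stop word's tokens sitting at the tail of generated_token_probs. A minimal sketch of that operation (hypothetical helper, assuming k never exceeds the vector size):

#include <vector>

template <typename T>
static std::vector<T> drop_tail(const std::vector<T> & v, size_t k) {
    // keep everything except the last k entries (the stop word's tokens)
    return std::vector<T>(v.begin(), v.end() - k);
}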
@ -1456,7 +1462,11 @@ int main(int argc, char **argv)
if (!llama.has_next_token) { if (!llama.has_next_token) {
// Generation is done, send extra information. // Generation is done, send extra information.
const json data = format_final_response(llama, "", llama.generated_token_probs); const json data = format_final_response(
llama,
"",
std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
);
const std::string str = const std::string str =
"data: " + "data: " +
@ -1585,7 +1595,7 @@ int main(int argc, char **argv)
svr.set_base_dir(sparams.public_path); svr.set_base_dir(sparams.public_path);
// to make it ctrl+clickable: // to make it ctrl+clickable:
fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
LOG_INFO("HTTP server listening", { LOG_INFO("HTTP server listening", {
{"hostname", sparams.hostname}, {"hostname", sparams.hostname},


@ -0,0 +1,8 @@
set(TARGET speculative)
add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -0,0 +1,292 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "build-info.h"
#include "common.h"
#include "llama.h"
#include "grammar-parser.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.model_draft.empty()) {
fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init(params.numa);
llama_model * model_tgt = NULL;
llama_model * model_dft = NULL;
llama_context * ctx_tgt = NULL;
llama_context * ctx_dft = NULL;
// load the target model
params.perplexity = true; // HACK: enable logits_all = true
std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
// load the draft model
params.model = params.model_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
const int max_context_size = llama_n_ctx(ctx_tgt);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
}
fflush(stderr);
const int n_input = inp.size();
const auto t_enc_start = ggml_time_us();
// eval the prompt with both models
llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads);
llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads);
llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads);
const auto t_enc_end = ggml_time_us();
// the 2 models should have the same vocab
const int n_ctx = llama_n_ctx(ctx_tgt);
const int n_vocab = llama_n_vocab(ctx_tgt);
//GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
// how many tokens to draft each time
const int n_draft = params.n_draft;
int n_predict = 0;
int n_drafted = 0;
int n_accept = 0;
int n_past_tgt = inp.size();
int n_past_dft = inp.size();
std::vector<llama_token> drafted;
std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_tokens.begin(), last_tokens.end(), 0);
for (auto & id : inp) {
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
// used to determine end of generation
bool has_eos = false;
// grammar stuff
struct llama_grammar * grammar_dft = NULL;
struct llama_grammar * grammar_tgt = NULL;
grammar_parser::parse_state parsed_grammar;
// if requested - load the grammar, error checking is omitted for brevity
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
return 1;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}
const auto t_dec_start = ggml_time_us();
while (true) {
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
int i_dft = 0;
while (true) {
// sample from the target model
const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
// remember which tokens were sampled - used for repetition penalties during sampling
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
printf("%s", token_str.c_str());
fflush(stdout);
if (id == llama_token_eos(ctx_tgt)) {
has_eos = true;
}
++n_predict;
// check if the draft matches the target
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept;
++n_past_tgt;
++n_past_dft;
++i_dft;
continue;
}
// the drafted token was rejected or we are out of drafted tokens
if (i_dft < (int) drafted.size()) {
LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
} else {
LOG("out of drafted tokens\n");
}
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
++n_past_dft;
drafted.clear();
drafted.push_back(id);
break;
}
if (n_predict > params.n_predict || has_eos) {
break;
}
if (grammar_tgt) {
if (grammar_dft) {
llama_grammar_free(grammar_dft);
}
grammar_dft = llama_grammar_copy(grammar_tgt);
LOG("copied target grammar to draft grammar\n");
}
// sample n_draft tokens from the draft model using greedy decoding
int n_past_cur = n_past_dft;
for (int i = 0; i < n_draft; ++i) {
float * logits = llama_get_logits(ctx_dft);
candidates.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (grammar_dft != NULL) {
llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
}
// computes softmax and sorts the candidates
llama_sample_softmax(ctx_dft, &cur_p);
for (int i = 0; i < 3; ++i) {
LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
}
// TODO: better logic?
if (cur_p.data[0].p < 2*cur_p.data[1].p) {
LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
break;
}
// drafted token
const llama_token id = cur_p.data[0].id;
drafted.push_back(id);
++n_drafted;
// no need to evaluate the last drafted token, since we won't use the result
if (i == n_draft - 1) {
break;
}
// evaluate the drafted token on the draft model
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
++n_past_cur;
if (grammar_dft != NULL) {
llama_grammar_accept_token(ctx_dft, grammar_dft, id);
}
}
// evaluate the target model on the drafted tokens
llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
++n_past_tgt;
// the first token is always proposed by the target model before the speculation loop
drafted.erase(drafted.begin());
}
auto t_dec_end = ggml_time_us();
LOG_TEE("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
// TODO: make sure these numbers are computed correctly
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("\ndraft:\n");
llama_print_timings(ctx_dft);
LOG_TEE("\ntarget:\n");
llama_print_timings(ctx_tgt);
llama_free(ctx_tgt);
llama_free_model(model_tgt);
llama_free(ctx_dft);
llama_free_model(model_dft);
if (grammar_dft != NULL) {
llama_grammar_free(grammar_dft);
llama_grammar_free(grammar_tgt);
}
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}
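The accept/reject loop above is the heart of the example; a self-contained toy run of the same control flow (stand-in token ids, no llama.cpp calls):

#include <cstdio>
#include <vector>

int main() {
    const std::vector<int> drafted = {5, 7, 9};     // proposed by the draft model
    const std::vector<int> target  = {5, 7, 8, 3};  // what the target model samples
    size_t i_dft = 0;
    int n_accept = 0;
    for (int id : target) {
        if (i_dft < drafted.size() && id == drafted[i_dft]) {
            ++n_accept;   // draft verified: this target token came "for free"
            ++i_dft;
            continue;
        }
        // first mismatch: keep the target's token and re-draft from here
        printf("rejected draft at position %zu, keeping target token %d\n", i_dft, id);
        break;
    }
    printf("accepted %d of %zu drafted tokens\n", n_accept, drafted.size());
    return 0;
}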


@ -1,3 +1,8 @@
// defines MAP_ANONYMOUS
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml.h" #include "ggml.h"
#include <assert.h> #include <assert.h>
@ -6,6 +11,26 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/types.h>
#include <sys/mman.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <memoryapi.h>
#endif
#define UNUSED(x) (void)(x) #define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@ -99,19 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
} }
#endif #endif
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
return ggml_nbytes(tensor); return ggml_nbytes(tensor);
UNUSED(alloc); UNUSED(alloc);
} }
// check if a tensor is allocated by this buffer
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
void * ptr = tensor->data;
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
}
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
#endif #endif
size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment); size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@ -177,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
} }
// this is a very naive implementation, but for our case the number of free blocks should be very small // this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
void * ptr = tensor->data; void * ptr = tensor->data;
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { if (ggml_allocr_is_own(alloc, tensor) == false) {
// the tensor was not allocated in this buffer // the tensor was not allocated in this buffer
// this can happen because the graph allocator will try to free weights and other tensors from different buffers // this can happen because the graph allocator will try to free weights and other tensors from different buffers
// the easiest way to deal with this is just to ignore it // the easiest way to deal with this is just to ignore it
return; return;
} }
size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment); size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
@ -281,17 +311,64 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
return alloc; return alloc;
} }
// address and size of the buffer when measuring // OS specific functions to allocate and free uncommitted virtual memory
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers static void * alloc_vmem(size_t size) {
static void * const MEASURE_BASE_ADDR = (void *) 0x1000; #if defined(_WIN32)
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
#elif defined(_POSIX_MAPPED_FILES)
return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
#else
// use a fixed address for other platforms
uintptr_t base_addr = (uintptr_t)-size - 0x100;
return (void *)base_addr;
#endif
}
static void free_vmem(void * base_addr, size_t size) {
#if defined(_WIN32)
VirtualFree(base_addr, 0, MEM_RELEASE);
UNUSED(size);
#elif defined(_POSIX_MAPPED_FILES)
munmap(base_addr, size);
#else
// nothing to do
UNUSED(base_addr);
UNUSED(size);
#endif
}
// allocate uncommitted virtual memory to measure the size of the graph
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
// 1TB for 64-bit, 1GB for 32-bit
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
do {
*base_addr = alloc_vmem(*size);
if (*base_addr != NULL) {
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
return;
}
// try again with half the size
*size /= 2;
} while (*size > 0);
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
}
static void free_measure_vmem(void * base_addr, size_t size) {
free_vmem(base_addr, size);
}
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
void * base_addr;
size_t size;
alloc_measure_vmem(&base_addr, &size);
*alloc = (struct ggml_allocr){ *alloc = (struct ggml_allocr){
/*.data = */ MEASURE_BASE_ADDR, /*.data = */ base_addr,
/*.size = */ MEASURE_MAX_SIZE, /*.size = */ size,
/*.alignment = */ alignment, /*.alignment = */ alignment,
/*.n_free_blocks = */ 0, /*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}}, /*.free_blocks = */ {{0}},
@ -311,6 +388,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
} }
void ggml_allocr_free(struct ggml_allocr * alloc) { void ggml_allocr_free(struct ggml_allocr * alloc) {
if (alloc->measure) {
free_measure_vmem(alloc->data, alloc->size);
}
free(alloc); free(alloc);
} }
@ -380,8 +460,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
} }
// if the node's data is external, then we cannot re-use it // if the node's data is external, then we cannot re-use it
if ((char *) parent->data < (char *) alloc->data || if (ggml_allocr_is_own(alloc, parent) == false) {
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
continue; continue;
} }
@ -415,7 +494,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
} }
} }
static size_t ggml_allocator_alloc_graph_tensors_n( static size_t ggml_allocr_alloc_graph_tensors_n(
struct ggml_allocr * alloc, struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs, struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@ -493,7 +572,6 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
AT_PRINTF("\n"); AT_PRINTF("\n");
} }
// update parents // update parents
// update immediately if there is no parse_seq // update immediately if there is no parse_seq
// update only at barriers if there is parse_seq // update only at barriers if there is parse_seq
@ -521,12 +599,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
view_src_hn->n_views -= 1; view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocator_free_tensor(alloc, view_src); ggml_allocr_free_tensor(alloc, view_src);
} }
} }
else { else {
if (parent->data != node->data) { if (parent->data != node->data) {
ggml_allocator_free_tensor(alloc, parent); ggml_allocr_free_tensor(alloc, parent);
} }
} }
} }
@ -543,7 +621,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
for (int i = 0; outputs[g][i] != NULL; i++) { for (int i = 0; outputs[g][i] != NULL; i++) {
struct ggml_tensor * output = outputs[g][i]; struct ggml_tensor * output = outputs[g][i];
AT_PRINTF("output: %s\n", output->name); AT_PRINTF("output: %s\n", output->name);
ggml_allocator_free_tensor(alloc, output); ggml_allocr_free_tensor(alloc, output);
} }
} }
} }
@ -552,5 +630,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
} }
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
} }
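A hedged usage sketch of the measure allocator whose backing changed above: the measure pass walks the graph in reserved, uncommitted address space to find the peak size, then a real buffer of exactly that size is created. Graph construction is omitted, and llama.cpp rebuilds the graph between the two passes, so reusing one graph here is a simplification:

#include <stdlib.h>
#include "ggml.h"
#include "ggml-alloc.h"

static struct ggml_allocr * alloc_for_graph(struct ggml_cgraph * graph, size_t align) {
    struct ggml_allocr * measure = ggml_allocr_new_measure(align);
    const size_t mem_size = ggml_allocr_alloc_graph(measure, graph); // no memory is committed
    ggml_allocr_free(measure);                                       // releases the reserved range

    void * buf = malloc(mem_size);
    return ggml_allocr_new(buf, mem_size, align);                    // allocator over real memory
}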


@ -81,12 +81,29 @@
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300 #define __CUDA_ARCH__ 1300
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) { static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a); const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b); const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
return reinterpret_cast<const int&>(c); return reinterpret_cast<const int&>(c);
#else
int8x4_t c;
int16_t tmp;
#pragma unroll
for (int i = 0; i < 4; i++) {
tmp = va[i] - vb[i];
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
c[i] = tmp;
}
return reinterpret_cast<int&>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
} }
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
dst[i] = x[i] / (1.0f + expf(-x[i])); dst[i] = x[i] / (1.0f + expf(-x[i]));
} }
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
}
return a;
}
template <int block_size>
static __global__ void norm_f32(const float * x, float * dst, const int ncols) { static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.x*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x; const int tid = threadIdx.x;
const float eps = 1e-5f; const float eps = 1e-5f;
float mean = 0.0f; float2 mean_var = make_float2(0.f, 0.f);
float var = 0.0f;
for (int col = tid; col < ncols; col += WARP_SIZE) { for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row*ncols + col]; const float xi = x[row*ncols + col];
mean += xi; mean_var.x += xi;
var += xi * xi; mean_var.y += xi * xi;
} }
// sum up partial sums // sum up partial sums
mean_var = warp_reduce_sum(mean_var);
if (block_size > WARP_SIZE) {
__shared__ float2 s_sum[32];
int warp_id = threadIdx.x / WARP_SIZE;
int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = mean_var;
}
__syncthreads();
mean_var = s_sum[lane_id];
mean_var = warp_reduce_sum(mean_var);
}
const float mean = mean_var.x / ncols;
const float var = mean_var.y / ncols - mean * mean;
const float inv_std = rsqrtf(var + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
}
}
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll #pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) { for (int mask = 16; mask > 0; mask >>= 1) {
mean += __shfl_xor_sync(0xffffffff, mean, mask, 32); x += __shfl_xor_sync(0xffffffff, x, mask, 32);
var += __shfl_xor_sync(0xffffffff, var, mask, 32); }
} return x;
mean /= ncols;
var = var / ncols - mean * mean;
const float inv_var = rsqrtf(var + eps);
for (int col = tid; col < ncols; col += WARP_SIZE) {
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
}
} }
template <int block_size>
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) { static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
const int row = blockIdx.x*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x; const int tid = threadIdx.x;
float tmp = 0.0f; // partial sum for thread in warp float tmp = 0.0f; // partial sum for thread in warp
for (int col = tid; col < ncols; col += WARP_SIZE) { for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row*ncols + col]; const float xi = x[row*ncols + col];
tmp += xi * xi; tmp += xi * xi;
} }
// sum up partial sums // sum up partial sums
#pragma unroll tmp = warp_reduce_sum(tmp);
for (int mask = 16; mask > 0; mask >>= 1) { if (block_size > WARP_SIZE) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); __shared__ float s_sum[32];
int warp_id = threadIdx.x / WARP_SIZE;
int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = s_sum[lane_id];
tmp = warp_reduce_sum(tmp);
} }
const float mean = tmp / ncols; const float mean = tmp / ncols;
const float scale = rsqrtf(mean + eps); const float scale = rsqrtf(mean + eps);
for (int col = tid; col < ncols; col += WARP_SIZE) { for (int col = tid; col < ncols; col += block_size) {
dst[row*ncols + col] = scale * x[row*ncols + col]; dst[row*ncols + col] = scale * x[row*ncols + col];
} }
} }
@ -4186,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0); GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_dims(WARP_SIZE, 1, 1);
norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols); norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
} else {
const dim3 block_dims(1024, 1, 1);
norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
}
} }
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0); GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_dims(WARP_SIZE, 1, 1);
rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps); rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
}
} }
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) { static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
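The norm kernels above now template on block_size and reduce in two levels when launched with 1024 threads. A sketch of that pattern in isolation (assumes blockDim.x == 1024, i.e. exactly 32 warps, mirroring the launcher's large-ncols path):

static __device__ float block_reduce_sum_1024(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);   // warp-level butterfly
    }
    __shared__ float s_sum[32];
    const int warp_id = threadIdx.x / 32;
    const int lane_id = threadIdx.x % 32;
    if (lane_id == 0) {
        s_sum[warp_id] = x;          // one partial per warp
    }
    __syncthreads();
    x = s_sum[lane_id];              // redistribute the 32 partials across each warp
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);   // final butterfly
    }
    return x;
}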


@ -76,6 +76,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@ -116,10 +117,24 @@ static NSString * const msl_library_source = @"see metal.metal";
struct ggml_metal_context * ggml_metal_init(int n_cb) { struct ggml_metal_context * ggml_metal_init(int n_cb) {
metal_printf("%s: allocating\n", __func__); metal_printf("%s: allocating\n", __func__);
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); // Show all the Metal device instances in the system
NSArray * devices = MTLCopyAllDevices();
id <MTLDevice> device;
NSString * s;
for (device in devices) {
s = [device name];
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
}
// Pick and show default Metal device
device = MTLCreateSystemDefaultDevice();
s = [device name];
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
// Configure context
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
ctx->device = device;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue]; ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0; ctx->n_buffers = 0;
ctx->concur_list_len = 0; ctx->concur_list_len = 0;
@ -205,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@ -270,6 +286,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(norm); GGML_METAL_DEL_KERNEL(norm);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@ -854,7 +871,11 @@ void ggml_metal_graph_compute(
{ {
nth0 = 32; nth0 = 32;
nth1 = 1; nth1 = 1;
if (ne11 * ne12 < 4) {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
} else {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
}
} break; } break;
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
{ {
@ -906,8 +927,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 2; nth0 = 4; //1;
nth1 = 32; nth1 = 8; //32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
} break; } break;
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
@ -955,9 +976,12 @@ void ggml_metal_graph_compute(
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 || if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q3_K) { else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -971,8 +995,8 @@ void ggml_metal_graph_compute(
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; int64_t ny = (ne11 + 3)/4;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
} }
} break; } break;


@ -133,19 +133,24 @@ kernel void kernel_soft_max(
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
if (tpitg[0] == 0) { // the loop, and when that is done, buf[0] has the correct (synchronized) value
buf[0] = buf[0]; //if (tpitg[0] == 0) {
} // buf[0] = buf[0];
//}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float max = buf[0]; const float max = buf[0];
// parallel sum // parallel sum
buf[tpitg[0]] = 0.0f; buf[tpitg[0]] = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
buf[tpitg[0]] += exp(psrc0[i00] - max); const float exp_psrc0 = exp(psrc0[i00] - max);
buf[tpitg[0]] += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not
// wish to compute it twice.
pdst[i00] = exp_psrc0;
} }
// reduce // reduce
@ -157,17 +162,18 @@ kernel void kernel_soft_max(
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast // broadcast - not needed, see above
if (tpitg[0] == 0) { //// broadcast
buf[0] = buf[0]; //if (tpitg[0] == 0) {
} // buf[0] = buf[0];
//}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float sum = buf[0]; const float sum = buf[0];
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
pdst[i00] = exp(psrc0[i00] - max) / sum; pdst[i00] /= sum;
} }
} }
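The change above caches exp(x - max) from the first pass so the normalization pass is a plain division. The same optimization on the CPU, for reference:

#include <algorithm>
#include <cmath>

static void softmax(const float * x, float * y, int n) {
    const float max = *std::max_element(x, x + n);
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        y[i] = std::exp(x[i] - max);  // exp evaluated once per element, result kept
        sum += y[i];
    }
    for (int i = 0; i < n; ++i) {
        y[i] /= sum;                  // second pass only divides
    }
}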
@ -214,25 +220,27 @@ kernel void kernel_norm(
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast //// broadcast
if (tpitg == 0) { //if (tpitg == 0) {
sum[0] /= ne00; // sum[0] /= ne00;
} //}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float mean = sum[0]; const float mean = sum[0];
// recenter // recenter and VARIANCE
device float * y = dst + tgpig*ne00; device float * y = dst + tgpig*ne00;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
y[i00] = x[i00] - mean;
}
// VARIANCE
// parallel sum
sum[tpitg] = 0.0f; sum[tpitg] = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) { for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
y[i00] = x[i00] - mean;
sum[tpitg] += y[i00] * y[i00]; sum[tpitg] += y[i00] * y[i00];
} }
//// VARIANCE
//// parallel sum
//sum[tpitg] = 0.0f;
//for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
// sum[tpitg] += y[i00] * y[i00];
//}
// reduce // reduce
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg/2; i > 0; i /= 2) { for (uint i = ntg/2; i > 0; i /= 2) {
@ -241,11 +249,11 @@ kernel void kernel_norm(
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
// broadcast //// broadcast
if (tpitg == 0) { //if (tpitg == 0) {
sum[0] /= ne00; // sum[0] /= ne00;
} //}
threadgroup_barrier(mem_flags::mem_threadgroup); //threadgroup_barrier(mem_flags::mem_threadgroup);
const float variance = sum[0]; const float variance = sum[0];
const float scale = 1.0f/sqrt(variance + eps); const float scale = 1.0f/sqrt(variance + eps);
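Likewise, kernel_norm now folds recentering and the variance partial sums into one loop. The scalar equivalent of the fused pass:

static float recenter_and_variance(const float * x, float * y, int n, float mean) {
    float var = 0.0f;
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] - mean;   // recenter
        var += y[i] * y[i];   // accumulate variance in the same pass
    }
    return var / n;
}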
@@ -435,6 +443,8 @@ kernel void kernel_mul_mat_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
+#define NB_Q8_0 8
+
 kernel void kernel_mul_mat_q8_0_f32(
     device const void * src0,
     device const float * src1,
@@ -463,30 +473,30 @@ kernel void kernel_mul_mat_q8_0_f32(
     device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
     device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
 
-    float yl[16];
+    float yl[NB_Q8_0];
     float sumf[nr]={0.f};
 
-    const int ix = tiisg/2;
-    const int il = tiisg%2;
+    const int ix = tiisg/4;
+    const int il = tiisg%4;
 
-    device const float * yb = y + ix * QK8_0 + 16*il;
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
 
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        for (int i = 0; i < 16; ++i) {
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
             yl[i] = yb[i];
         }
 
         for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + 16*il;
+            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
             float sumq = 0.f;
-            for (int iq = 0; iq < 16; ++iq) {
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                 sumq += qs[iq] * yl[iq];
             }
             sumf[row] += sumq*x[ib+row*nb].d;
         }
 
-        yb += QK8_0 * 16;
+        yb += NB_Q8_0 * nw;
    }
 
     for (int row = 0; row < nr; ++row) {
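To see why the new indexing is a correct tiling: with a 32-lane simdgroup (nw == 32) and NB_Q8_0 == 8, tiisg/4 selects one of 8 blocks and tiisg%4 selects one of four 8-quant slices within it, so each iteration covers exactly 8 blocks of QK8_0 == 32 quants with no gaps or overlaps; hence the block stride of nw/4 and the pointer advance of NB_Q8_0 * nw floats. A small standalone check of that arithmetic (a C++ sketch mirroring the kernel's constants):

    #include <cassert>

    int main() {
        const int QK8_0 = 32, NB_Q8_0 = 8, nw = 32; // quants/block, quants/lane, SIMD width
        bool covered[8 * QK8_0] = {};
        for (int tiisg = 0; tiisg < nw; ++tiisg) {
            const int ix = tiisg / 4; // which of the 8 blocks this lane starts on
            const int il = tiisg % 4; // which 8-quant slice inside that block
            for (int i = 0; i < NB_Q8_0; ++i) {
                bool & c = covered[ix * QK8_0 + il * NB_Q8_0 + i];
                assert(!c); // no overlap between lanes
                c = true;
            }
        }
        for (bool c : covered) assert(c); // no gaps: all 8 blocks fully covered
        return 0;
    }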
@@ -497,6 +507,60 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }
 
+kernel void kernel_mul_mat_f16_f32_1row(
+    device const  char * src0,
+    device const  char * src1,
+    device       float * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant   int64_t & ne10,
+    constant   int64_t & ne11,
+    constant   int64_t & ne12,
+    constant  uint64_t & nb10,
+    constant  uint64_t & nb11,
+    constant  uint64_t & nb12,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint  tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+    const int64_t im = tgpig.z;
+
+    device const half  * x = (device const half  *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+    device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4  * x4 = (device const half4  *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
+}
+
+#define N_F16_F32 4
+
 kernel void kernel_mul_mat_f16_f32(
     device const  char * src0,
     device const  char * src1,
@@ -515,55 +579,58 @@ kernel void kernel_mul_mat_f16_f32(
     constant  uint64_t & nb12,
     constant   int64_t & ne0,
     constant   int64_t & ne1,
-    threadgroup float  * sum [[threadgroup(0)]],
     uint3 tgpig[[threadgroup_position_in_grid]],
-    uint3 tpig[[thread_position_in_grid]],
-    uint3 tpitg[[thread_position_in_threadgroup]],
-    uint3  tptg[[threads_per_threadgroup]]) {
+    uint  tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
+    const int64_t rb = tgpig.y*N_F16_F32;
     const int64_t im = tgpig.z;
 
     device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
-    device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
-    uint ith = tpitg.x;
-    uint nth = tptg.x;
-
-    sum[ith] = 0.0f;
-
-    for (int i = ith; i < ne00; i += nth) {
-        sum[ith] += (float) x[i] * (float) y[i];
-    }
-
-    // accumulate the sum from all threads in the threadgroup
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
-    }
-
-    // Original implementation. Left behind commented out for now
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    //for (uint i = tptg.x/2; i > 0; i /= 2) {
-    //    if (tpitg.x < i) {
-    //        sum[tpitg.x] += sum[tpitg.x + i];
-    //    }
-    //    threadgroup_barrier(mem_flags::mem_threadgroup);
-    //}
-    //
-    //if (tpitg.x == 0) {
-    //    dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
-    //}
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *) x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float  * y  = (device const float  *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
 }
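Both the new one-row kernel and the rewritten kernel_mul_mat_f16_f32 drop the threadgroup-memory reduction (a shared partial-sum array plus several barriers) in favor of simd_sum, which reduces across the 32 lanes of a simdgroup in registers. A CPU-side sketch of what that lane reduction computes (assuming a 32-wide group; an illustration, not Metal):

    // Tree-reduce 32 per-lane partial sums the way simd_sum does in hardware:
    // no shared memory and no barriers involved.
    float simd_sum_emulated(float lane[32]) {
        for (int offset = 16; offset > 0; offset >>= 1) {
            for (int i = 0; i < offset; ++i) {
                lane[i] += lane[i + offset];
            }
        }
        return lane[0]; // what the tiisg == 0 lane writes to dst
    }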
 kernel void kernel_alibi_f32(
@@ -1262,7 +1329,8 @@ kernel void kernel_mul_mat_q4_K_f32(
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int r2 = tgpig.z;
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
     const int ib_row = first_row * nb;
     const uint offset0 = r2/gqa*(nb*ne0);
     device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }
 
-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
 
     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_GPU) {
-            d_Q = (cl_mem) src0->data;
+            d_Q = (cl_mem) src0->extra;
         } else {
             GGML_ASSERT(false);
         }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     CL_CHECK(clFinish(queue));
 
-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
ggml.c
@@ -817,46 +817,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 #if !defined(__aarch64__)
 
-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
-        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
-        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
-        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
-        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0)  + (int16_t)vgetq_lane_s8(v, 1)  +
-        (int16_t)vgetq_lane_s8(v, 2)  + (int16_t)vgetq_lane_s8(v, 3)  +
-        (int16_t)vgetq_lane_s8(v, 4)  + (int16_t)vgetq_lane_s8(v, 5)  +
-        (int16_t)vgetq_lane_s8(v, 6)  + (int16_t)vgetq_lane_s8(v, 7)  +
-        (int16_t)vgetq_lane_s8(v, 8)  + (int16_t)vgetq_lane_s8(v, 9)  +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +825,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -801,7 +801,7 @@ class SpecialVocab:
             else:
                 continue
             for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
-                if isinstance(maybe_token_id, int):
+                if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
                     self.special_token_ids[typ] = maybe_token_id
                 break
         return True
@@ -814,7 +814,7 @@ class SpecialVocab:
             config = json.load(f)
         for typ in self.special_token_types:
             maybe_token_id = config.get(f'{typ}_token_id')
-            if isinstance(maybe_token_id, int):
+            if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
                 self.special_token_ids[typ] = maybe_token_id
         return True
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.3.1"
+version = "0.3.2"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
grammars/json_arr.gbnf Normal file
@@ -0,0 +1,34 @@
+# This is the same as json.gbnf but we restrict whitespaces at the end of the root array
+# Useful for generating JSON arrays
+
+root   ::= arr
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+
+arr  ::=
+  "[\n" ws (
+    value
+    (",\n" ws value)*
+  )? "]"
+
+object ::=
+  "{" ws (
+    string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+
+array  ::=
+  "[" ws (
+    value
+    ("," ws value)*
+  )? "]" ws
+
+string ::=
+  "\"" (
+    [^"\\] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+  )* "\"" ws
+
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= ([ \t\n] ws)?
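A conformance example may help: the root rule allows whitespace between elements but none after the final "]", which is what makes the grammar suitable for terminating generation cleanly. The following string (shown as a C++ literal for consistency with the other sketches here) is accepted; appending so much as a trailing newline after the bracket would not be:

    // Accepted by json_arr.gbnf: elements separated by ",\n", nothing after "]".
    const char * json_arr_example =
        "[\n"
        "  \"first\",\n"
        "  {\"nested\": true}]";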
@@ -13,6 +13,26 @@
 //
 
 #include <arm_neon.h>
 
+#if !defined(__aarch64__)
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+#endif
+
 #else
 
 #ifdef __wasm_simd128__
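These are scalar stand-ins for ARMv8-only horizontal intrinsics, so the NEON path also builds on 32-bit ARM: vaddvq_s16 is an across-vector add of eight int16 lanes, and vpaddq_s16 is a pairwise add. What vaddvq_s16 computes, in a plain C++ sketch:

    #include <cstdint>

    // horizontal add of eight int16 lanes, widened to 32 bits
    int32_t horizontal_add_s16(const int16_t lanes[8]) {
        int32_t sum = 0;
        for (int i = 0; i < 8; ++i) {
            sum += lanes[i];
        }
        return sum;
    }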
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         float ax = fabsf(x[i]);
         if (ax > amax) { amax = ax; max = x[i]; }
     }
-    if (!amax) { // all zero
+    if (amax < 1e-30f) { // all zero
         for (int i = 0; i < n; ++i) {
             L[i] = 0;
         }
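The stricter guard matters numerically: !amax only short-circuits when the row maximum is exactly zero, while a tiny-but-nonzero maximum makes the subsequent scale -nmax/max blow up past FLT_MAX. A minimal illustration of the failure mode (assumed constants, not the quantizer itself):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float max    = 1e-38f;       // nonzero, so "!amax" would not catch it
        const float iscale = -32.f / max;  // ~ -3.2e39 -> overflows float to -inf
        printf("iscale is finite: %s\n", std::isfinite(iscale) ? "yes" : "no"); // "no"
        return 0;
    }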
@@ -1066,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
         }
 
+        if (!max_abs_scale) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
+            continue;
+        }
+
         float iscale = -128.f/max_scale;
         y[i].d = ggml_fp32_to_fp16(1/iscale);
         for (int ib = 0; ib < QK_K/16; ++ib) {
@@ -1302,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t  vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x2_t q2bytes;
     uint8_t aux[16];
@@ -1608,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t  vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q2bytes;
@@ -2592,8 +2623,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
 
-        //int32x4_t isum = mzero;
-
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
@@ -3092,9 +3121,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
@@ -3437,8 +3468,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mh = vdupq_n_u8(16);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
     uint8x16x4_t q5h;
@@ -3656,7 +3689,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t  vzero = vdupq_n_s32(0);
+#endif
     //const int8x16_t  m32s = vdupq_n_s8(32);
 
     const uint8x16_t mone = vdupq_n_u8(3);
@@ -4045,8 +4080,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t  vzero = vdupq_n_s32(0);
     const int8x16_t  m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t  vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
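All of these hunks follow one pattern: vzero/mzero feed only the dot-product intrinsic path, so declaring them unconditionally can trip unused-variable warnings (fatal under -Werror) on NEON builds without __ARM_FEATURE_DOTPROD. The generic shape of the fix, with a hypothetical USE_FAST_PATH macro standing in for the feature test:

    // Declare a constant only inside the #if branch that consumes it, so the
    // fallback build never sees an unused variable.
    int accumulate(int x) {
    #if defined(USE_FAST_PATH)
        const int zero = 0;     // used only by the fast path
        return zero + x;
    #else
        return x;               // fallback never references `zero`
    #endif
    }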
@@ -325,6 +325,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GPT2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTJ,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTNEOX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -1605,10 +1643,14 @@ static void llm_load_hparams(
     GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
 
+    if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
         if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
             throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
         }
+        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+        // gpt-j n_rot = rotary_dim
+    }
 
     // arch-specific KVs
     switch (model.arch) {
@@ -2900,7 +2942,12 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+    //       with the BLAS calls. need a better solution
+    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+        n_threads = std::min(4, n_threads);
+    }
 
     struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -3324,10 +3371,16 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-                        fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        try {
+                            llama_token token_byte = llama_byte_to_token(vocab, *j);
+                            output.push_back(token_byte);
+                        } catch (const std::out_of_range & err) {
+                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        }
+                    } else {
+                        output.push_back((*token_multibyte).second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
             }
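Two fixes are folded in here: the old code unconditionally pushed (*token_multibyte).second even when the lookup had failed (dereferencing the end iterator), and it gave up instead of trying a raw byte-to-token fallback. The new shape tries the byte lookup first and treats a thrown std::out_of_range as "byte not in vocab". A self-contained sketch of that shape (assuming the lookup throws like std::map::at; the stand-in below is hypothetical, not llama_byte_to_token's real signature):

    #include <cstdio>
    #include <map>
    #include <stdexcept>
    #include <vector>

    // hypothetical stand-in for the byte lookup: throws if the byte is unknown
    int byte_to_token(const std::map<char, int> & byte_tokens, char byte) {
        return byte_tokens.at(byte); // std::map::at throws std::out_of_range
    }

    void push_byte_token(const std::map<char, int> & byte_tokens, char byte, std::vector<int> & output) {
        try {
            output.push_back(byte_to_token(byte_tokens, byte));
        } catch (const std::out_of_range &) {
            fprintf(stderr, "ERROR: byte not found in vocab\n");
        }
    }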
@@ -3802,6 +3855,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
     delete grammar;
 }
 
+struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+    // redirect elements in stacks to point to new rules
+    for (size_t is = 0; is < result->stacks.size(); is++) {
+        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                        result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
 //
 // sampling
 //
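llama_grammar_copy has one subtlety worth flagging: after the member-wise copy, the element pointers stored in stacks still point into the source grammar's rules, so each one is rebased onto the corresponding element of the new rules. The same pattern in miniature (a generic C++ sketch, not the llama.cpp types):

    #include <vector>

    struct Graph {
        std::vector<int>   nodes;
        std::vector<int *> refs; // interior pointers into `nodes`
    };

    Graph copy_graph(const Graph & src) {
        Graph dst{src.nodes, src.refs};      // refs still point into src.nodes
        for (int * & p : dst.refs) {
            p = dst.nodes.data() + (p - src.nodes.data()); // rebase by index
        }
        return dst;
    }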
@@ -5292,7 +5364,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed                        =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
-        /*.gpu_layers                  =*/ 0,
+        /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.rope_freq_base              =*/ 10000.0f,
@@ -5309,6 +5381,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding                   =*/ false,
     };
 
+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }
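One behavioral consequence of the default change above: Metal builds now offload by default, so a caller that wants CPU-only inference has to opt out explicitly. A usage sketch against the public API:

    #include "llama.h"

    // On GGML_USE_METAL builds, n_gpu_layers now defaults to 1;
    // override it to keep everything on the CPU.
    llama_context_params cpu_only_params() {
        llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 0;
        return params;
    }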
@@ -5501,7 +5577,6 @@ struct llama_context * llama_new_context_with_model(
             }
 #endif
         }
-    }
 
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
@@ -5538,6 +5613,7 @@ struct llama_context * llama_new_context_with_model(
 #undef LLAMA_METAL_CHECK_BUF
         }
 #endif
+    }
 
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
@@ -410,6 +410,8 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+
     //
     // Sampling functions
     //