Merge remote-tracking branch 'origin/master' into skip-unused-2

commit 646caf9648
Author: ochafik
Date:   2023-09-11 23:54:36 +01:00

65 changed files with 5744 additions and 4542 deletions

.clang-tidy

@@ -3,6 +3,7 @@ Checks: >
bugprone-*,
-bugprone-easily-swappable-parameters,
-bugprone-implicit-widening-of-multiplication-result,
-bugprone-misplaced-widening-cast,
-bugprone-narrowing-conversions,
readability-*,
-readability-avoid-unconditional-preprocessor-if,
@@ -15,4 +16,8 @@ Checks: >
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
portability-*,
misc-*,
-misc-const-correctness,
-misc-non-private-member-variables-in-classes,
-misc-no-recursion,
FormatStyle: none

.devops/full-cuda.Dockerfile

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt

.devops/main-cuda.Dockerfile

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential
apt-get install -y build-essential git
WORKDIR /app

.devops/tools.sh

@@ -7,15 +7,12 @@ arg1="$1"
# Shift the arguments to remove the first one
shift
# Join the remaining arguments into a single string
arg2="$@"
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert.py "$arg2"
python3 ./convert.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$arg2"
./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./main "$arg2"
./main "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -27,7 +24,7 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
fi
done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./server "$arg2"
./server "$@"
else
echo "Unknown command: $arg1"
echo "Available commands: "

.editorconfig

@@ -17,3 +17,6 @@ indent_style = tab
[prompts/*.txt]
insert_final_newline = unset
[examples/server/public/*]
indent_size = 2

.github/workflows/build.yml

@@ -18,7 +18,6 @@ on:
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_NITER: 1
GGML_N_THREADS: 1
jobs:
@@ -198,6 +197,62 @@ jobs:
cd build
ctest --verbose --timeout 900
macOS-latest-cmake-ios:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
cmake --build . --config Release
macOS-latest-cmake-tvos:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=tvOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
cmake --build . --config Release
windows-latest-cmake:
runs-on: windows-latest

.github/workflows/code-coverage.yml (new file)

@@ -0,0 +1,36 @@
name: Code Coverage
on: [push, pull_request]
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
jobs:
run:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 lcov
- name: Build
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
- name: Run tests
run: CC=gcc-8 make test
- name: Generate coverage report
run: |
make coverage
make lcov-report
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
with:
files: lcov-report/coverage.info

.gitignore

@@ -6,6 +6,10 @@
*.exe
*.dll
*.log
*.gcov
*.gcno
*.gcda
*.dot
.DS_Store
.build/
.cache/
@@ -17,6 +21,9 @@
.vs/
.vscode/
lcov-report/
gcovr-report/
build*/
out/
tmp/
@@ -24,27 +31,29 @@ tmp/
models/*
models-mnt
/main
/quantize
/quantize-stats
/result
/perplexity
/embedding
/train-text-from-scratch
/convert-llama2c-to-ggml
/simple
/benchmark-matmult
/vdot
/server
/Pipfile
/baby-llama
/beam-search
/benchmark-matmult
/convert-llama2c-to-ggml
/embd-input-test
/embedding
/gguf
/gguf-llama-simple
/libllama.so
/llama-bench
/baby-llama
/beam-search
/main
/metal
/perplexity
/quantize
/quantize-stats
/result
/save-load-state
/server
/simple
/speculative
/train-text-from-scratch
/vdot
build-info.h
arm_neon.h
compile_commands.json

CMakeLists.txt

@@ -36,6 +36,12 @@ endif()
# Option list
#
if (APPLE)
set(LLAMA_METAL_DEFAULT ON)
else()
set(LLAMA_METAL_DEFAULT OFF)
endif()
# general
option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
@@ -76,7 +82,8 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
@@ -159,6 +166,33 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()
endif()
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
message(STATUS "Metal framework found")
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
add_compile_definitions(GGML_USE_METAL)
if (LLAMA_METAL_NDEBUG)
add_compile_definitions(GGML_METAL_NDEBUG)
endif()
# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
endif()
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@@ -294,29 +328,6 @@ if (LLAMA_CUBLAS)
endif()
endif()
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
add_compile_definitions(GGML_USE_METAL)
#add_compile_definitions(GGML_METAL_NDEBUG)
# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
endif()
if (LLAMA_MPI)
cmake_minimum_required(VERSION 3.10)
find_package(MPI)
@@ -408,6 +419,7 @@ if (LLAMA_ALL_WARNINGS)
-Wpointer-arith
-Wmissing-prototypes
-Werror=implicit-int
-Wno-unused-function
)
set(cxx_flags
-Wall
@@ -417,6 +429,10 @@ if (LLAMA_ALL_WARNINGS)
-Wno-unused-function
-Wno-multichar
)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# g++ only
set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
endif()
else()
# todo : msvc
endif()
@@ -465,7 +481,7 @@ if (NOT MSVC)
endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64"))
message(STATUS "ARM detected")
if (MSVC)
# TODO: arm msvc?
@@ -540,12 +556,66 @@ else()
message(STATUS "Unknown architecture")
endif()
#
# POSIX conformance
#
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
add_compile_definitions(_XOPEN_SOURCE=600)
# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
remove_definitions(-D_XOPEN_SOURCE=600)
add_compile_definitions(_XOPEN_SOURCE=700)
endif()
# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions(_GNU_SOURCE)
endif()
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
if (
CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
CMAKE_SYSTEM_NAME MATCHES "iOS" OR
CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
CMAKE_SYSTEM_NAME MATCHES "DragonFly"
)
add_compile_definitions(_DARWIN_C_SOURCE)
endif()
# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
add_compile_definitions(__BSD_VISIBLE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
add_compile_definitions(_NETBSD_SOURCE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
add_compile_definitions(_BSD_SOURCE)
endif()
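(The Makefile changes below mirror this same set of feature-test macros.) As a hedged illustration of why these definitions exist, and not part of the diff itself, the following standalone C++ snippet uses exactly the interfaces named in the comments above; on a strict-POSIX toolchain it relies on `_XOPEN_SOURCE` being defined:

```cpp
// Illustrative sketch only: the interfaces that _XOPEN_SOURCE=600 unlocks.
#define _XOPEN_SOURCE 600 // must precede any standard header

#include <math.h>   // M_PI (XSI extension)
#include <stdio.h>
#include <stdlib.h> // posix_memalign (POSIX.1-2001)
#include <time.h>   // clock_gettime, CLOCK_MONOTONIC (POSIX.1-2001, optional)

int main(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);

    void * buf = NULL;
    if (posix_memalign(&buf, 64, 1024) == 0) { // 64-byte-aligned scratch buffer
        printf("t = %lld.%09ld s, pi = %f\n",
               (long long) ts.tv_sec, (long) ts.tv_nsec, M_PI);
        free(buf);
    }
    return 0;
}
```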
#
# libraries
#
# ggml
if (GGML_USE_CPU_HBM)
add_definitions(-DGGML_USE_CPU_HBM)
find_library(memkind memkind REQUIRED)
endif()
add_library(ggml OBJECT
ggml.c
ggml.h
@@ -561,6 +631,9 @@ add_library(ggml OBJECT
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
if (GGML_USE_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind)
endif()
add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
if (BUILD_SHARED_LIBS)

Makefile

@@ -1,27 +1,11 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search tests/test-c.o
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
default: $(BUILD_TARGETS)
test:
@echo "Running tests..."
@for test_target in $(TEST_TARGETS); do \
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
continue; \
else \
./$$test_target; \
fi; \
done
@echo "All tests have been run."
all: $(BUILD_TARGETS) $(TEST_TARGETS)
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
ifndef UNAME_S
UNAME_S := $(shell uname -s)
@@ -35,12 +19,13 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)
# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
ifndef LLAMA_NO_METAL
LLAMA_METAL := 1
endif
ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
ifeq ($(SYSCTL_M),1)
@@ -51,6 +36,60 @@ ifeq ($(UNAME_S),Darwin)
endif
endif
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
BUILD_TARGETS += metal
endif
default: $(BUILD_TARGETS)
test: $(TEST_TARGETS)
@failures=0; \
for test_target in $(TEST_TARGETS); do \
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
continue; \
else \
echo "Running test $$test_target..."; \
./$$test_target; \
fi; \
if [ $$? -ne 0 ]; then \
printf 'Test %s FAILED!\n\n' $$test_target; \
failures=$$(( failures + 1 )); \
else \
printf 'Test %s passed.\n\n' $$test_target; \
fi; \
done; \
if [ $$failures -gt 0 ]; then \
printf '\n%s tests failed.\n' $$failures; \
exit 1; \
fi
@echo 'All tests passed.'
all: $(BUILD_TARGETS) $(TEST_TARGETS)
coverage: ## Run code coverage
gcov -pb tests/*.cpp
lcov-report: coverage ## Generate lcov report
mkdir -p lcov-report
lcov --capture --directory . --output-file lcov-report/coverage.info
genhtml lcov-report/coverage.info --output-directory lcov-report
gcovr-report: coverage ## Generate gcovr report
mkdir -p gcovr-report
gcovr --root . --html --html-details --output gcovr-report/coverage.html
ifdef RISCV_CROSS_COMPILE
CC := riscv64-unknown-linux-gnu-gcc
CXX := riscv64-unknown-linux-gnu-g++
endif
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)
#
# Compile flags
#
@@ -62,53 +101,97 @@ OPT = -Ofast
else
OPT = -O3
endif
CFLAGS = -I. $(OPT) -std=c11 -fPIC
CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
LDFLAGS =
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = $(OPT) -std=c11 -fPIC
MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
MK_LDFLAGS =
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
MK_CFLAGS += -D_XOPEN_SOURCE=600
MK_CXXFLAGS += -D_XOPEN_SOURCE=600
# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
ifeq ($(UNAME_S),OpenBSD)
MK_CFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
MK_CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
endif
# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
ifeq ($(UNAME_S),Linux)
MK_CFLAGS += -D_GNU_SOURCE
MK_CXXFLAGS += -D_GNU_SOURCE
endif
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
ifeq ($(UNAME_S),Darwin)
MK_CFLAGS += -D_DARWIN_C_SOURCE
MK_CXXFLAGS += -D_DARWIN_C_SOURCE
endif
ifeq ($(UNAME_S),DragonFly)
MK_CFLAGS += -D__BSD_VISIBLE
MK_CXXFLAGS += -D__BSD_VISIBLE
endif
# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
ifeq ($(UNAME_S),FreeBSD)
MK_CFLAGS += -D__BSD_VISIBLE
MK_CXXFLAGS += -D__BSD_VISIBLE
endif
ifeq ($(UNAME_S),NetBSD)
MK_CFLAGS += -D_NETBSD_SOURCE
MK_CXXFLAGS += -D_NETBSD_SOURCE
endif
ifeq ($(UNAME_S),OpenBSD)
MK_CFLAGS += -D_BSD_SOURCE
MK_CXXFLAGS += -D_BSD_SOURCE
endif
ifdef LLAMA_DEBUG
CFLAGS += -O0 -g
CXXFLAGS += -O0 -g
LDFLAGS += -g
MK_CFLAGS += -O0 -g
MK_CXXFLAGS += -O0 -g
MK_LDFLAGS += -g
else
CFLAGS += -DNDEBUG
CXXFLAGS += -DNDEBUG
MK_CPPFLAGS += -DNDEBUG
endif
ifdef LLAMA_SERVER_VERBOSE
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
ifdef LLAMA_CODE_COVERAGE
MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
endif
ifdef LLAMA_DISABLE_LOGS
MK_CPPFLAGS += -DLOG_DISABLE_LOGS
endif # LLAMA_DISABLE_LOGS
# warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-Wmissing-prototypes -Werror=implicit-int
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
ifeq '' '$(findstring clang++,$(CXX))'
# g++ only
MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
endif
# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Darwin)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),FreeBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),NetBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),OpenBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Haiku)
CFLAGS += -pthread
CXXFLAGS += -pthread
ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
MK_CFLAGS += -pthread
MK_CXXFLAGS += -pthread
endif
# detect Windows
@@ -134,104 +217,117 @@ ifeq ($(_WIN32),1)
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
MK_CFLAGS += -pg
MK_CXXFLAGS += -pg
endif
ifdef LLAMA_PERF
CFLAGS += -DGGML_PERF
CXXFLAGS += -DGGML_PERF
MK_CPPFLAGS += -DGGML_PERF
endif
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifndef RISCV
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
# Use all CPU extensions that are available:
CFLAGS += -march=native -mtune=native
CXXFLAGS += -march=native -mtune=native
MK_CFLAGS += -march=native -mtune=native
MK_CXXFLAGS += -march=native -mtune=native
# Usage AVX-only
#CFLAGS += -mfma -mf16c -mavx
#CXXFLAGS += -mfma -mf16c -mavx
#MK_CFLAGS += -mfma -mf16c -mavx
#MK_CXXFLAGS += -mfma -mf16c -mavx
# Usage SSSE3-only (Not is SSE3!)
#CFLAGS += -mssse3
#CXXFLAGS += -mssse3
#MK_CFLAGS += -mssse3
#MK_CXXFLAGS += -mssse3
endif
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
# https://github.com/ggerganov/llama.cpp/issues/2922
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native
MK_CFLAGS += -mcpu=native
MK_CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, Zero
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 2
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 3, 4, Zero 2 (32-bit)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
MK_CFLAGS += -mfp16-format=ieee -mno-unaligned-access
MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M)))
CFLAGS += -mcpu=power9
CXXFLAGS += -mcpu=power9
endif
# Require c++23's std::byteswap for big-endian support.
ifeq ($(UNAME_M),ppc64)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
MK_CFLAGS += -mcpu=power9
MK_CXXFLAGS += -mcpu=power9
endif
endif
else
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif
ifndef LLAMA_NO_K_QUANTS
CFLAGS += -DGGML_USE_K_QUANTS
CXXFLAGS += -DGGML_USE_K_QUANTS
MK_CPPFLAGS += -DGGML_USE_K_QUANTS
OBJS += k_quants.o
ifdef LLAMA_QKK_64
CFLAGS += -DGGML_QKK_64
CXXFLAGS += -DGGML_QKK_64
MK_CPPFLAGS += -DGGML_QKK_64
endif
endif
ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
# Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
MK_CPPFLAGS += -DGGML_USE_ACCELERATE
MK_LDFLAGS += -framework Accelerate
endif
endif # LLAMA_NO_ACCELERATE
ifdef LLAMA_MPI
CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual
OBJS += ggml-mpi.o
endif # LLAMA_MPI
ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
LDFLAGS += $(shell pkg-config --libs openblas)
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
MK_LDFLAGS += $(shell pkg-config --libs openblas)
endif # LLAMA_OPENBLAS
ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
LDFLAGS += -lblis -L/usr/local/lib
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
ifdef LLAMA_CUDA_NVCC
@@ -282,14 +378,15 @@ endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
LDFLAGS += -lclblast -framework OpenCL
MK_LDFLAGS += -lclblast -framework OpenCL
else
LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
endif
OBJS += ggml-opencl.o
@@ -304,10 +401,9 @@ ifdef LLAMA_HIPBLAS
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
LDFLAGS += -lhipblas -lamdhip64 -lrocblas
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
@@ -322,10 +418,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
endif # LLAMA_HIPBLAS
ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJS += ggml-metal.o
MK_CPPFLAGS += -DGGML_USE_METAL
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJS += ggml-metal.o
ifdef LLAMA_METAL_NDEBUG
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
endif
endif # LLAMA_METAL
ifdef LLAMA_METAL
@@ -338,7 +436,7 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
ifdef LLAMA_NO_K_QUANTS
ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS
@@ -346,12 +444,16 @@ endif # LLAMA_NO_K_QUANTS
ifndef LLAMA_NO_SKIP_UNUSED_LOGITS
CFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS
CXXFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS
endif
endif # LLAMA_NO_SKIP_UNUSED_LOGITS
ifdef LLAMA_DISABLE_LOGS
CFLAGS += -DLOG_DISABLE_LOGS
CXXFLAGS += -DLOG_DISABLE_LOGS
endif # LLAMA_DISABLE_LOGS
# combine build flags with cmdline overrides
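# (MK_* holds the defaults computed above; CPPFLAGS/CFLAGS/CXXFLAGS/LDFLAGS
#  supplied on the command line or in the environment are appended after them,
#  so e.g. `make CFLAGS=-O1` extends the defaults instead of replacing them)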
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
#
# Print build information
@@ -396,7 +498,7 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean:
rm -vf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
#
# Examples
@@ -454,9 +556,8 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
BUILD_TARGETS += metal
endif
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ifdef LLAMA_METAL
metal: examples/metal/metal.cpp ggml.o $(OBJS)

Package.swift

@@ -2,8 +2,30 @@
import PackageDescription
#if arch(arm) || arch(arm64)
let platforms: [SupportedPlatform]? = [
.macOS(.v11),
.iOS(.v14),
.watchOS(.v4),
.tvOS(.v14)
]
let exclude: [String] = []
let additionalSources: [String] = ["ggml-metal.m"]
let additionalSettings: [CSetting] = [
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_SWIFT"),
.define("GGML_USE_METAL")
]
#else
let platforms: [SupportedPlatform]? = nil
let exclude: [String] = ["ggml-metal.metal"]
let additionalSources: [String] = []
let additionalSettings: [CSetting] = []
#endif
let package = Package(
name: "llama",
platforms: platforms,
products: [
.library(name: "llama", targets: ["llama"]),
],
@@ -11,14 +33,23 @@ let package = Package(
.target(
name: "llama",
path: ".",
exclude: ["ggml-metal.metal"],
sources: ["ggml.c", "llama.cpp"],
exclude: exclude,
sources: [
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"k_quants.c",
] + additionalSources,
publicHeadersPath: "spm-headers",
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32"]),
.define("GGML_USE_K_QUANTS"),
.define("GGML_USE_ACCELERATE")
] + additionalSettings,
linkerSettings: [
.linkedFramework("Accelerate")
]
),
)
],
cxxLanguageStandard: .cxx11
)

README.md

@@ -11,21 +11,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics
- #### IMPORTANT: Tokenizer fixes and API change (developers and projects using `llama.cpp` built-in tokenization must read): https://github.com/ggerganov/llama.cpp/pull/2810
- Local Falcon 180B inference on Mac Studio
- GGUFv2 adds support for 64-bit sizes + backwards compatible: https://github.com/ggerganov/llama.cpp/pull/2821
- Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
- A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
### Current `master` should be considered in Beta - expect some issues for a few days!
### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
### Issues with non-GGUF models will be considered with low priority!
https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e
----
@@ -114,11 +102,13 @@ as the main playground for developing new features for the [ggml](https://github
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
**UI:**
- [nat/openplayground](https://github.com/nat/openplayground)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
- [withcatai/catai](https://github.com/withcatai/catai)
---
@@ -278,29 +268,11 @@ In order to build llama.cpp you have three different options.
### Metal Build
Using Metal allows the computation to be executed on the GPU for Apple devices:
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
- Using `make`:
```bash
LLAMA_METAL=1 make
```
- Using `CMake`:
```bash
mkdir build-metal
cd build-metal
cmake -DLLAMA_METAL=ON ..
cmake --build . --config Release
```
When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
Any value larger than 0 will offload the computation to the GPU. For example:
```bash
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
```
When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line
argument.
### MPI Build
@@ -429,7 +401,7 @@ Building the program with BLAS support may lead to some performance improvements
- #### hipBLAS
This provide BLAS acceleation on HIP supported GPU like AMD GPU.
This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
Windows support is coming soon...
@@ -463,6 +435,8 @@ Building the program with BLAS support may lead to some performance improvements
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
- <details>
<summary>Installing the OpenCL SDK from source</summary>
@@ -480,10 +454,27 @@ Building the program with BLAS support may lead to some performance improvements
```
</details>
Installing CLBlast: it may be found in your operating system's packages.
##### Installing CLBlast
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
Alternatively, they may be built from source.
- <details>
<summary>If not, then installing from source:</summary>
<summary>Windows:</summary>
```cmd
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
git clone https://github.com/CNugteren/CLBlast.git
mkdir CLBlast\build
cd CLBlast\build
cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/CLBlast
```
- <details>
<summary>Unix:</summary>
```sh
git clone https://github.com/CNugteren/CLBlast.git
@@ -497,21 +488,32 @@ Building the program with BLAS support may lead to some performance improvements
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
</details>
Building:
##### Building Llama with CLBlast
- Build with make:
```sh
make LLAMA_CLBLAST=1
```
- CMake:
- CMake (Unix):
```sh
mkdir build
cd build
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
cmake --build . --config Release
```
- CMake (Windows):
```cmd
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
```
Running:
##### Running Llama with CLBlast
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
@@ -723,12 +725,12 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
- [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGML)
- [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGML)
- [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGML)
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
- [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGUF)
- [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGUF)
- [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGUF)
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGUF)
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF)
### Verifying the model files

codecov.yml (new file)

@@ -0,0 +1,14 @@
comment: off
coverage:
status:
project:
default:
target: auto
threshold: 0
base: auto
patch:
default:
target: auto
threshold: 0
base: auto

common/common.cpp

@@ -24,7 +24,9 @@
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <codecvt>
#include <locale>
#include <windows.h>
@@ -55,7 +57,7 @@ int32_t get_num_physical_cores() {
siblings.insert(line);
}
}
if (siblings.size() > 0) {
if (!siblings.empty()) {
return static_cast<int32_t>(siblings.size());
}
#elif defined(__APPLE__) && defined(__MACH__)
@@ -303,6 +305,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_keep = std::stoi(argv[i]);
} else if (arg == "--draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_draft = std::stoi(argv[i]);
} else if (arg == "--chunks") {
if (++i >= argc) {
invalid_param = true;
@@ -315,6 +323,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.model = argv[i];
} else if (arg == "-md" || arg == "--model-draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_draft = argv[i];
} else if (arg == "-a" || arg == "--alias") {
if (++i >= argc) {
invalid_param = true;
@@ -570,106 +584,109 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, "usage: %s [options]\n", argv[0]);
fprintf(stdout, "\n");
fprintf(stdout, "options:\n");
fprintf(stdout, " -h, --help show this help message and exit\n");
fprintf(stdout, " -i, --interactive run in interactive mode\n");
fprintf(stdout, " --interactive-first run in interactive mode and wait for input right away\n");
fprintf(stdout, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
fprintf(stdout, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
fprintf(stdout, " -r PROMPT, --reverse-prompt PROMPT\n");
fprintf(stdout, " halt generation at PROMPT, return control in interactive mode\n");
fprintf(stdout, " (can be specified more than once for multiple prompts).\n");
fprintf(stdout, " --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stdout, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -p PROMPT, --prompt PROMPT\n");
fprintf(stdout, " prompt to start generation with (default: empty)\n");
fprintf(stdout, " -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
fprintf(stdout, " not supported with --interactive or other interactive options\n");
fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
fprintf(stdout, " --random-prompt start with a randomized prompt.\n");
fprintf(stdout, " --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
fprintf(stdout, " -f FNAME, --file FNAME\n");
fprintf(stdout, " prompt file to start generation.\n");
fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
fprintf(stdout, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
fprintf(stdout, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
fprintf(stdout, " --mirostat N use Mirostat sampling.\n");
fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
fprintf(stdout, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
fprintf(stdout, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
fprintf(stdout, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
fprintf(stdout, " modifies the likelihood of token appearing in the completion,\n");
fprintf(stdout, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
fprintf(stdout, " --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
fprintf(stdout, " --grammar-file FNAME file to read grammar from\n");
fprintf(stdout, " --cfg-negative-prompt PROMPT\n");
fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-negative-prompt-file FNAME\n");
fprintf(stdout, " negative prompt file to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
fprintf(stdout, " --no-penalize-nl do not penalize newline token\n");
fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
printf("usage: %s [options]\n", argv[0]);
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -i, --interactive run in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
printf(" -r PROMPT, --reverse-prompt PROMPT\n");
printf(" halt generation at PROMPT, return control in interactive mode\n");
printf(" (can be specified more than once for multiple prompts).\n");
printf(" --color colorise output to distinguish prompt and user input from generations\n");
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -p PROMPT, --prompt PROMPT\n");
printf(" prompt to start generation with (default: empty)\n");
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
printf(" not supported with --interactive or other interactive options\n");
printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
printf(" --random-prompt start with a randomized prompt.\n");
printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
printf(" -f FNAME, --file FNAME\n");
printf(" prompt file to start generation.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
printf(" --mirostat N use Mirostat sampling.\n");
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
printf(" modifies the likelihood of token appearing in the completion,\n");
printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
printf(" --grammar-file FNAME file to read grammar from\n");
printf(" --cfg-negative-prompt PROMPT\n");
printf(" negative prompt to use for guidance. (default: empty)\n");
printf(" --cfg-negative-prompt-file FNAME\n");
printf(" negative prompt file to use for guidance. (default: empty)\n");
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
printf(" --no-penalize-nl do not penalize newline token\n");
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
if (llama_mlock_supported()) {
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
if (llama_mmap_supported()) {
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
fprintf(stdout, " if run without this previously, it is recommended to drop the system page cache before using this\n");
fprintf(stdout, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
printf(" --numa attempt optimizations that help on some NUMA systems\n");
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
fprintf(stdout, " number of layers to store in VRAM\n");
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
#ifdef GGML_USE_CUBLAS
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
printf(" -nommq, --no-mul-mat-q\n");
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS
#endif
fprintf(stdout, " --mtest compute maximum memory usage\n");
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
fprintf(stdout, " --verbose-prompt print prompt before generation\n");
printf(" --mtest compute maximum memory usage\n");
printf(" --export export the computation graph to 'llama.ggml'\n");
printf(" --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
fprintf(stdout, " -ld LOGDIR, --logdir LOGDIR\n");
fprintf(stdout, " path under which to save YAML logs (no logging if unset)\n");
fprintf(stdout, "\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf("\n");
}
std::string gpt_random_prompt(std::mt19937 & rng) {
@@ -700,7 +717,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
lparams.n_ctx = params.n_ctx;
lparams.n_batch = params.n_batch;
lparams.n_gpu_layers = params.n_gpu_layers;
if (params.n_gpu_layers != -1) {
lparams.n_gpu_layers = params.n_gpu_layers;
}
lparams.main_gpu = params.main_gpu;
lparams.tensor_split = params.tensor_split;
lparams.low_vram = params.low_vram;
@@ -750,6 +769,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
}
{
LOG("warming up the model with an empty run\n");
const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
llama_reset_timings(lctx);
}
return std::make_tuple(model, lctx);
}
@@ -822,6 +849,130 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
return result;
}
//
// Sampling utils
//
llama_token llama_sample_token(
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_grammar * grammar,
const struct gpt_params & params,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
int idx) {
const int n_ctx = llama_n_ctx(ctx);
const int n_vocab = llama_n_vocab(ctx);
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
const float repeat_penalty = params.repeat_penalty;
const float alpha_presence = params.presence_penalty;
const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
llama_token id = 0;
float * logits = llama_get_logits(ctx) + idx * n_vocab;
// Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
candidates.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
}
// apply penalties
if (!last_tokens.empty()) {
const float nl_logit = logits[llama_token_nl(ctx)];
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &cur_p,
last_tokens.data() + last_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
last_tokens.data() + last_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}
if (grammar != NULL) {
llama_sample_grammar(ctx, &cur_p, grammar);
}
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &cur_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k (ctx, &cur_p, top_k, 1);
llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
llama_sample_typical (ctx, &cur_p, typical_p, 1);
llama_sample_top_p (ctx, &cur_p, top_p, 1);
llama_sample_temperature(ctx, &cur_p, temp);
{
const int n_top = 10;
LOG("top %d candidates:\n", n_top);
for (int i = 0; i < n_top; i++) {
const llama_token id = cur_p.data[i].id;
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
}
}
id = llama_sample_token(ctx, &cur_p);
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
}
}
// printf("`%d`", candidates_p.size);
if (grammar != NULL) {
llama_grammar_accept_token(ctx, grammar, id);
}
return id;
}
//
// YAML utils
//
// returns true if successful, false otherwise
bool create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
@@ -1027,7 +1178,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
fprintf(stream, "hellaswag_tasks: %ld # default: 400\n", params.hellaswag_tasks);
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
@@ -1060,9 +1211,10 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");

View file

@@ -20,6 +20,9 @@
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)
//
// CLI argument parsing
//
@@ -32,8 +35,9 @@ struct gpt_params {
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
@@ -63,6 +67,7 @@ struct gpt_params {
float cfg_scale = 1.f; // How strong is guidance
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
@@ -156,6 +161,40 @@ std::string llama_detokenize_bpe(
llama_context * ctx,
const std::vector<llama_token> & tokens);
//
// Sampling utils
//
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
//
// required:
// - ctx: context to use for sampling
// - params: sampling parameters
//
// optional:
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
// - grammar: grammar to use for sampling, ignore if NULL
// - last_tokens: needed for repetition penalty, ignore if empty
// - idx: sample from llama_get_logits(ctx) + idx * n_vocab
//
// returns:
// - token: sampled token
// - candidates: vector of candidate tokens
//
llama_token llama_sample_token(
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_grammar * grammar,
const struct gpt_params & params,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
int idx = 0);
//
// YAML utils
//
bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);

View file

@@ -235,6 +235,7 @@ namespace console {
int estimateWidth(char32_t codepoint) {
#if defined(_WIN32)
(void)codepoint;
return 1;
#else
return wcwidth(codepoint);

View file

@@ -415,6 +415,7 @@ namespace grammar_parser {
std::vector<const llama_grammar_element *> parse_state::c_rules() {
std::vector<const llama_grammar_element *> ret;
ret.reserve(rules.size());
for (const auto & rule : rules) {
ret.push_back(rule.data());
}

View file

@@ -154,7 +154,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
// #include "log.h"
//
#ifndef LOG_NO_TIMESTAMPS
#ifndef _WIN32
#ifndef _MSC_VER
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
@@ -167,7 +167,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
#endif
#ifdef LOG_TEE_TIMESTAMPS
#ifndef _WIN32
#ifndef _MSC_VER
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
@@ -187,7 +187,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
// #include "log.h"
//
#ifndef LOG_NO_FILE_LINE_FUNCTION
#ifndef _WIN32
#ifndef _MSC_VER
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
@@ -200,7 +200,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
#endif
#ifdef LOG_TEE_FILE_LINE_FUNCTION
#ifndef _WIN32
#ifndef _MSC_VER
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
@@ -224,7 +224,7 @@ enum LogTriState
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
#ifndef _WIN32
#ifndef _MSC_VER
#define LOG_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
@@ -247,7 +247,7 @@ enum LogTriState
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#ifndef _WIN32
#ifndef _MSC_VER
#define LOG_TEE_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
@@ -284,7 +284,7 @@ enum LogTriState
// Main LOG macro.
// behaves like printf, and supports arguments the exact same way.
//
#ifndef _WIN32
#ifndef _MSC_VER
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
@@ -298,14 +298,14 @@ enum LogTriState
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
#ifndef _WIN32
#ifndef _MSC_VER
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
#ifndef _WIN32
#ifndef _MSC_VER
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
@@ -341,14 +341,14 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
}
}
if (_disabled)
{
// Log is disabled
return nullptr;
}
if (_initialized)
{
if (_disabled)
{
// Log is disabled
return nullptr;
}
// with fallback in case something went wrong
return logfile ? logfile : stderr;
}
@@ -461,7 +461,7 @@ inline void log_test()
LOG("13 Hello World this time in yet new file?\n")
log_set_target(log_filename_generator("llama_autonamed", "log"));
LOG("14 Hello World in log with generated filename!\n")
#ifdef _WIN32
#ifdef _MSC_VER
LOG_TEE("15 Hello msvc TEE without arguments\n")
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
LOG_TEELN("17 Hello msvc TEELN without arguments\n")
@@ -513,16 +513,16 @@ inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string &
inline void log_print_usage()
{
fprintf(stdout, "log options:\n");
printf("log options:\n");
/* format
fprintf(stdout, " -h, --help show this help message and exit\n");*/
printf(" -h, --help show this help message and exit\n");*/
/* spacing
fprintf(stdout, "__-param----------------Description\n");*/
fprintf(stdout, " --log-test Run simple logging test\n");
fprintf(stdout, " --log-disable Disable trace logs\n");
fprintf(stdout, " --log-enable Enable trace logs\n");
fprintf(stdout, " --log-file Specify a log filename (without extension)\n");
fprintf(stdout, " Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
printf("__-param----------------Description\n");*/
printf(" --log-test Run simple logging test\n");
printf(" --log-disable Disable trace logs\n");
printf(" --log-enable Enable trace logs\n");
printf(" --log-file Specify a log filename (without extension)\n");
printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)

View file

@@ -1,18 +1,24 @@
#!/usr/bin/env python3
# HF falcon--> gguf conversion
import gguf
import os
import sys
import struct
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
import argparse
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
from typing import Any, List
from pathlib import Path
from transformers import AutoTokenizer
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -49,10 +55,10 @@ def count_model_parts(dir_model: Path) -> int:
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
return parser.parse_args()
args = parse_args()
@@ -114,9 +120,9 @@ gguf_writer.add_file_type(ftype)
print("gguf: get tokenizer metadata")
tokens: List[bytearray] = []
scores: List[float] = []
toktypes: List[int] = []
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():

View file

@@ -1,18 +1,23 @@
#!/usr/bin/env python3
# HF gptneox--> gguf conversion
import gguf
import os
import sys
import struct
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
import argparse
from transformers import AutoTokenizer # type: ignore[import]
from typing import Any, List
from pathlib import Path
from transformers import AutoTokenizer
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -112,7 +117,7 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
print("gguf: get tokenizer metadata")
tokens: List[bytearray] = []
tokens: list[bytearray] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():

View file

@@ -1,258 +0,0 @@
#!/usr/bin/env python3
# 7b pth llama --> gguf conversion
# Only models with a single datafile are supported, like 7B
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
import gguf
import os
import sys
import struct
import json
import numpy as np
import torch
import argparse
from typing import Any, List, TypeAlias
from pathlib import Path
from sentencepiece import SentencePieceProcessor
#NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("consolidated."):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a PyTorch 7B LLaMA model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "LlamaForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
if num_parts > 1:
print("gguf: Only models with a single datafile are supported.")
sys.exit()
ARCH=gguf.MODEL_ARCH.LLAMA
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: List[bytes] = []
scores: List[float] = []
toktypes: List[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab and scores")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # default to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name == "rope.freqs":
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View file

@@ -1,9 +1,18 @@
#!/usr/bin/env python3
import sys, struct, math, argparse
from __future__ import annotations
import argparse
import math
import struct
import sys
from enum import IntEnum
from pathlib import Path
import numpy as np
import os
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
# Note: Does not support GGML_QKK_64
@@ -26,10 +35,35 @@ GGML_QUANT_SIZES = {
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}
class GGMLFormat(IntEnum):
GGML = 0
GGMF = 1
GGJT = 2
class GGMLFType(IntEnum):
ALL_F32 = 0
MOSTLY_F16 = 1
MOSTLY_Q4_0 = 2
MOSTLY_Q4_1 = 3
MOSTLY_Q4_1_SOME_F16 = 4
MOSTLY_Q8_0 = 7
MOSTLY_Q5_0 = 8
MOSTLY_Q5_1 = 9
MOSTLY_Q2_K = 10
MOSTLY_Q3_K_S = 11
MOSTLY_Q3_K_M = 12
MOSTLY_Q3_K_L = 13
MOSTLY_Q4_K_S = 14
MOSTLY_Q4_K_M = 15
MOSTLY_Q5_K_S = 16
MOSTLY_Q5_K_M = 17
MOSTLY_Q6_K = 18
class Hyperparameters:
def __init__(self):
self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
self.n_ff = 0
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
self.n_layer = self.n_rot = self.n_ff = 0
self.ftype = GGMLFType.ALL_F32
def set_n_ff(self, model):
ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
@@ -45,16 +79,21 @@ class Hyperparameters:
self.n_head,
self.n_layer,
self.n_rot,
self.ftype,
ftype,
) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
try:
self.ftype = GGMLFType(ftype)
except ValueError:
raise ValueError(f'Invalid ftype {ftype}')
return 4 * 7
def __str__(self):
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
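# --- illustrative sketch (not part of the patch): the seven fields unpacked in
# load() above are consecutive little-endian uint32 values; the numbers here
# are made up for demonstration ---
_hp_raw = struct.pack('<7I', 32000, 4096, 256, 32, 32, 128, 1)
n_vocab_x, n_embd_x, n_mult_x, n_head_x, n_layer_x, n_rot_x, ftype_x = struct.unpack('<7I', _hp_raw)
assert GGMLFType(ftype_x) == GGMLFType.MOSTLY_F16  # raw ftype 1 maps to the enum above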
class Vocab:
def __init__(self):
def __init__(self, load_scores = True):
self.items = []
self.load_scores = load_scores
def load(self, data, offset, n_vocab):
orig_offset = offset
@@ -62,20 +101,24 @@ class Vocab:
itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
assert itemlen < 4096, 'Absurd vocab item length'
offset += 4
vocab = bytes(data[offset:offset + itemlen])
item_text = bytes(data[offset:offset + itemlen])
offset += itemlen
score = struct.unpack('<f', data[offset:offset + 4])[0]
offset += 4
self.items.append((vocab, score))
if self.load_scores:
item_score = struct.unpack('<f', data[offset:offset + 4])[0]
offset += 4
else:
item_score = 0.0
self.items.append((item_text, item_score))
return offset - orig_offset
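# --- illustrative sketch (not part of the patch): one serialized vocab entry,
# as parsed by Vocab.load above, is a uint32 length prefix, the UTF-8 text,
# then (for formats newer than plain GGML) a float32 score ---
_vocab_entry = struct.pack('<I', 5) + b'hello' + struct.pack('<f', -3.5)
_vocab = Vocab(load_scores = True)
assert _vocab.load(_vocab_entry, 0, 1) == len(_vocab_entry)  # 13 bytes consumed
assert _vocab.items == [(b'hello', -3.5)]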
class Tensor:
def __init__(self):
def __init__(self, use_padding = True):
self.name = None
self.dims = ()
self.dims: tuple[int, ...] = ()
self.dtype = None
self.start_offset = 0
self.len_bytes = np.int64(0)
self.use_padding = use_padding
def load(self, data, offset):
orig_offset = offset
@@ -91,7 +134,7 @@ class Tensor:
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])
offset += name_len
pad = ((offset + 31) & ~31) - offset
pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
offset += pad
n_elems = np.prod(self.dims)
n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
@@ -101,7 +144,7 @@ class Tensor:
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset
class GGMLV3Model:
class GGMLModel:
def __init__(self):
self.hyperparameters = None
self.vocab = None
@@ -109,20 +152,52 @@ class GGMLV3Model:
self.tensors = []
def validate_header(self, data, offset):
if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
raise ValueError('Only GGJTv3 supported')
return 8
magic = bytes(data[offset:offset + 4])
if magic == b'GGUF':
raise ValueError('File is already in GGUF format.')
if magic == b'lmgg':
self.file_format = GGMLFormat.GGML
self.format_version = 1
return 4
version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
if magic == b'fmgg':
if version != 1:
raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
self.file_format = GGMLFormat.GGMF
self.format_version = version
return 8
if magic == b'tjgg':
if version < 1 or version > 3:
raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
self.file_format = GGMLFormat.GGJT
self.format_version = version
return 8
raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
def validate_conversion(self, ftype):
err = ''
if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
err = 'Q4 and Q8 quantizations changed in GGJTv3.'
if len(err) > 0:
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
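# --- illustrative note (not part of the patch): a GGJTv3 header is the magic
# b'tjgg' followed by a little-endian uint32 version, so
#   m = GGMLModel(); m.validate_header(b'tjgg' + struct.pack('<I', 3), 0)
# consumes 8 bytes and leaves m.file_format == GGMLFormat.GGJT with
# m.format_version == 3, while a b'GGUF' header raises instead ---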
def load(self, data, offset):
offset += self.validate_header(data, offset)
hp = Hyperparameters()
offset += hp.load(data, offset)
vocab = Vocab()
print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
self.validate_conversion(hp.ftype)
vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
offset += vocab.load(data, offset, hp.n_vocab)
tensors = []
tensors: list[Tensor] = []
tensor_map = {}
while offset < len(data):
tensor = Tensor()
tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
offset += tensor.load(data, offset)
tensor_map[tensor.name] = len(tensors)
tensors.append(tensor)
@@ -160,7 +235,10 @@ class GGMLToGGUF:
def save(self):
print('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
gguf_writer = gguf.GGUFWriter(
self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
use_temp_file = False )
self.add_params(gguf_writer)
self.add_vocab(gguf_writer)
if self.special_vocab is not None:
@@ -177,7 +255,10 @@ class GGMLToGGUF:
def add_params(self, gguf_writer):
hp = self.model.hyperparameters
cfg = self.cfg
desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
if cfg.desc is not None:
desc = cfg.desc
else:
desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
try:
# Filenames aren't necessarily valid UTF8.
name = cfg.name if cfg.name is not None else cfg.input.name
@@ -187,6 +268,7 @@ class GGMLToGGUF:
if name is not None:
gguf_writer.add_name(name)
gguf_writer.add_description(desc)
gguf_writer.add_file_type(int(hp.ftype))
if self.params_override is not None:
po = self.params_override
assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
@@ -223,7 +305,8 @@ class GGMLToGGUF:
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)
assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
assert len(tokens) == hp.n_vocab, \
f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
if len(toktypes) > 0:
@@ -275,7 +358,11 @@ class GGMLToGGUF:
tempdims[1] = tempdims[0]
tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
gguf_writer.add_tensor(
mapped_name,
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
raw_shape = tempdims,
raw_dtype = tensor.dtype )
def handle_metadata(cfg, hp):
import convert
@@ -297,32 +384,46 @@ def handle_metadata(cfg, hp):
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else:
raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
vocab = convert.load_vocab(
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
cfg.vocabtype )
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab)
return (params, vocab, svocab)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
parser.add_argument('--name', help = 'Set model name')
parser.add_argument('--desc', help = 'Set model description')
parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
parser.add_argument('--input', '-i', type = Path, required = True,
help = 'Input GGMLv3 filename')
parser.add_argument('--output', '-o', type = Path, required = True,
help ='Output GGUF filename')
parser.add_argument('--name',
help = 'Set model name')
parser.add_argument('--desc',
help = 'Set model description')
parser.add_argument('--gqa', type = int, default = 1,
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06',
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048,
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path,
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path,
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
return parser.parse_args()
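# --- illustrative usage (not part of the patch; filenames are made up and the
# script name is assumed): converting a LLaMA2 GGJTv3 file might look like
#   python convert-llama-ggml-to-gguf.py -i ggml-model-q4_0.bin \
#       -o ggml-model-q4_0.gguf --eps 1e-5 -c 4096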
def main():
cfg = handle_args()
print(f'* Using config: {cfg}')
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
data = np.memmap(cfg.input, mode = 'r')
model = GGMLV3Model()
model = GGMLModel()
print('* Scanning GGML input file')
offset = model.load(data, 0)
print(f'* GGML model hyperparameters: {model.hyperparameters}')
@@ -337,7 +438,12 @@ def main():
print(f'* Special vocab: {special_vocab}')
else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
if model.file_format == GGMLFormat.GGML:
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
converter = GGMLToGGUF(model, data, cfg,
params_override = params_override,
vocab_override = vocab_override,
special_vocab = special_vocab )
converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}')

View file

@@ -1,277 +0,0 @@
#!/usr/bin/env python3
# HF llama --> gguf conversion
import gguf
import os
import sys
import struct
import json
import numpy as np
import torch
import argparse
from typing import Any, List, Optional, TypeAlias
from pathlib import Path
from sentencepiece import SentencePieceProcessor
#NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
# reverse HF permute back to original pth layout
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "LlamaForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.LLAMA
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: List[bytes] = []
scores: List[float] = []
toktypes: List[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # default to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# reverse permute these
if name.endswith(".q_proj.weight"):
data = reverse_hf_permute(data, head_count)
if name.endswith(".k_proj.weight"):
data = reverse_hf_permute(data, head_count, head_count_kv)
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View file

@@ -1,15 +1,17 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import re
import struct
import sys
from typing import Any, Dict, Sequence, BinaryIO
from typing import Any, BinaryIO, Sequence
import numpy as np
import torch
NUMPY_TYPE_TO_FTYPE: Dict[str, int] = {"float32": 0, "float16": 1}
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
HF_SUBLAYER_TO_GGML = {
@@ -46,7 +48,7 @@ def translate_tensor_name(t: str) -> str:
sys.exit(1)
def write_file_header(fout: BinaryIO, params: Dict[str, Any]) -> None:
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", params["r"]))

View file

@@ -1,9 +1,8 @@
#!/usr/bin/env python3
from __future__ import annotations
import gguf
import argparse
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import copy
import enum
import faulthandler
@@ -20,21 +19,27 @@ import struct
import sys
import time
import zipfile
import numpy as np
from abc import ABCMeta, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, Type, TypeVar, Union)
from sentencepiece import SentencePieceProcessor # type: ignore
from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
import numpy as np
from sentencepiece import SentencePieceProcessor # type: ignore[import]
import os
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
if TYPE_CHECKING:
from typing_extensions import TypeAlias
from typing import TypeAlias
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1)
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
ARCH=gguf.MODEL_ARCH.LLAMA
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
@@ -47,8 +52,8 @@ DEFAULT_CONCURRENCY = 8
@dataclass(frozen=True)
class DataType:
name: str
dtype: 'np.dtype[Any]'
valid_conversions: List[str]
dtype: np.dtype[Any]
valid_conversions: list[str]
def elements_to_bytes(self, n_elements: int) -> int:
return n_elements * self.dtype.itemsize
@@ -65,7 +70,7 @@ DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_convers
@dataclass(frozen=True)
class QuantizedDataType(DataType):
block_size: int
quantized_dtype: 'np.dtype[Any]'
quantized_dtype: np.dtype[Any]
ggml_type: gguf.GGMLQuantizationType
def quantize(self, arr: NDArray) -> NDArray:
@@ -84,7 +89,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
n_blocks = arr.size // self.block_size
blocks = arr.reshape((n_blocks, self.block_size))
# Much faster implementation of block quantization contributed by @Cebtenzzre
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[Tuple[Any, Any]]:
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
d = abs(blocks).max(axis = 1) / np.float32(127)
with np.errstate(divide = 'ignore'):
qs = (blocks / d[:, None]).round()
@@ -98,13 +103,13 @@ DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
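# --- illustrative sketch (not part of the patch): per 32-value block, Q8_0
# stores an f16 scale d = max|x| / 127 plus 32 int8 quants, as computed by
# quantize_blocks_q8_0 above ---
_blk = np.linspace(-1.0, 1.0, 32, dtype = np.float32)
_d = np.abs(_blk).max() / np.float32(127)
_qs = (_blk / _d).round().astype(np.int8)
assert np.abs(_qs.astype(np.float32) * _d - _blk).max() <= _d / 2  # reconstruction error bound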
# Quantized types skipped here because they may also map to np.float32
NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = {}
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
raise ValueError(f'Invalid duplicate data type {dt}')
NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
'BF16': DT_BF16,
'F16': DT_F16,
'F32': DT_F32,
@@ -119,14 +124,14 @@ class GGMLFileType(enum.IntEnum):
MostlyF16 = 1 # except 1d tensors
MostlyQ8_0 = 7 # except 1d tensors
def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
if dt is None:
raise ValueError(self)
# 1D tensors are always F32.
return dt if len(tensor.shape) > 1 else DT_F32
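# --- illustrative note (not part of the patch): for a 1-D tensor such as a
# [4096] norm weight, GGMLFileType.MostlyQ8_0.type_for_tensor(...) still
# returns DT_F32, while a 2-D [4096, 4096] weight matrix maps to DT_Q8_0 ---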
GGML_FILE_TYPE_TO_DATA_TYPE: Dict[GGMLFileType, DataType] = {
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
GGMLFileType.AllF32 : DT_F32,
GGMLFileType.MostlyF16 : DT_F16,
GGMLFileType.MostlyQ8_0: DT_Q8_0,
@@ -140,7 +145,6 @@ GGML_FILE_TYPE_TO_DATA_TYPE: Dict[GGMLFileType, DataType] = {
class Params:
n_vocab: int
n_embd: int
n_mult: int
n_layer: int
n_ctx: int
n_ff: int
@@ -148,25 +152,16 @@ class Params:
n_head_kv: int
f_norm_eps: float
f_rope_freq_base: Optional[float] = None
f_rope_scale: Optional[float] = None
f_rope_freq_base: float | None = None
f_rope_scale: float | None = None
ftype: Optional[GGMLFileType] = None
ftype: GGMLFileType | None = None
# path to the directory containing the model files
path_model: Optional['Path'] = None
path_model: Path | None = None
@staticmethod
def find_n_mult(n_ff: int, n_embd: int) -> int:
# hardcoded magic range
for n_mult in range(8192, 1, -1):
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
if calc_ff == n_ff:
return n_mult
raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
@staticmethod
def guessed(model: 'LazyModel') -> 'Params':
def guessed(model: LazyModel) -> Params:
# try transformer naming first
n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
@@ -192,7 +187,6 @@ class Params:
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer,
n_ctx = -1,
n_ff = n_ff,
@@ -202,7 +196,7 @@ class Params:
)
@staticmethod
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path))
n_vocab = config["vocab_size"]
@@ -220,8 +214,6 @@ class Params:
else:
f_rope_scale = None
n_mult = Params.find_n_mult(n_ff, n_embd)
if "max_sequence_length" in config:
n_ctx = config["max_sequence_length"]
elif "max_position_embeddings" in config:
@@ -233,7 +225,6 @@ class Params:
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer,
n_ctx = n_ctx,
n_ff = n_ff,
@@ -245,15 +236,14 @@ class Params:
)
# LLaMA v2 70B params.json
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod
def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path))
n_vocab = config["vocab_size"] if "vocab_size" in config else -1
n_embd = config["dim"]
n_layer = config["n_layers"]
n_mult = config["multiple_of"]
n_ff = -1
n_head = config["n_heads"]
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
@@ -261,7 +251,7 @@ class Params:
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
# hack to determine LLaMA v1 vs v2 vs CodeLlama
if f_rope_freq_base and f_rope_freq_base == 1000000:
if f_rope_freq_base == 1000000:
# CodeLlama
n_ctx = 16384
elif config["norm_eps"] == 1e-05:
@@ -280,7 +270,6 @@ class Params:
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer,
n_ctx = n_ctx,
n_ff = n_ff,
@@ -291,7 +280,7 @@ class Params:
)
@staticmethod
def load(model_plus: 'ModelPlus') -> 'Params':
def load(model_plus: ModelPlus) -> Params:
hf_config_path = model_plus.paths[0].parent / "config.json"
orig_config_path = model_plus.paths[0].parent / "params.json"
@@ -314,19 +303,31 @@ class Params:
#
class BpeVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
added_tokens: Dict[str, int]
added_tokens: dict[str, int]
if fname_added_tokens is not None:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
added_tokens = {}
# Fall back to trying to find the added tokens in tokenizer.json
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
if not tokenizer_json_file.is_file():
added_tokens = {}
else:
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
added_tokens = dict(
(item['content'], item['id'])
for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary.
if item['content'] not in self.bpe_tokenizer )
vocab_size: int = len(self.bpe_tokenizer)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
expected_end_id = vocab_size + len(actual_ids) - 1
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_list = [text for (text, idx) in items]
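# --- illustrative note (not part of the patch; token names are hypothetical):
# with vocab_size = 32000, added_tokens = {'<pad>': 32000, '<eot>': 32001}
# passes the check above, while {'<pad>': 31999} or {'<pad>': 32001} raises,
# since added IDs must form the contiguous range starting at vocab_size ---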
@@ -335,22 +336,34 @@ class BpeVocab:
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def bpe_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.bpe_tokenizer
from transformers.models.gpt2 import tokenization_gpt2
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
byte_encoder = tokenization_gpt2.bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
score = 0.0
for i, item in enumerate(tokenizer):
text: bytes = item.encode("utf-8")
score: float = -i
yield text, score, gguf.TokenType.USER_DEFINED
# FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
if i == 0 and text == b'<unk>':
toktype = gguf.TokenType.UNKNOWN
elif i == 1 or i == 2:
toktype = gguf.TokenType.CONTROL
elif i >= 3 and text.startswith(b'<0x'):
toktype = gguf.TokenType.BYTE
else:
toktype = gguf.TokenType.NORMAL
else:
toktype = gguf.TokenType.NORMAL
yield text, score, toktype
def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.bpe_tokens()
yield from self.added_tokens()
@@ -359,9 +372,9 @@
class SentencePieceVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: Dict[str, int]
added_tokens: dict[str, int]
if fname_added_tokens is not None:
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
@@ -380,7 +393,7 @@ class SentencePieceVocab:
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.id_to_piece(i)
@@ -404,19 +417,19 @@ class SentencePieceVocab:
yield text, score, toktype
def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.sentencepiece_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
Vocab = Union[BpeVocab, SentencePieceVocab]
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
#
# data loading
@@ -436,15 +449,15 @@ class Tensor(metaclass=ABCMeta):
data_type: DataType
@abstractmethod
def astype(self, data_type: DataType) -> 'Tensor': ...
def astype(self, data_type: DataType) -> Tensor: ...
@abstractmethod
def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ...
def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
@abstractmethod
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> 'UnquantizedTensor': ...
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
@abstractmethod
def part(self, n_part: int) -> 'UnquantizedTensor': ...
def part(self, n_part: int) -> UnquantizedTensor: ...
@abstractmethod
def to_ggml(self) -> 'GGMLCompatibleTensor': ...
def to_ggml(self) -> GGMLCompatibleTensor: ...
def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
@@ -465,22 +478,22 @@ class UnquantizedTensor(Tensor):
self.ndarray = bf16_to_fp32(self.ndarray)
return UnquantizedTensor(self.ndarray.astype(dtype))
def to_ggml(self) -> 'UnquantizedTensor':
def to_ggml(self) -> UnquantizedTensor:
return self
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> 'UnquantizedTensor':
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
def part(self, n_part: int) -> 'UnquantizedTensor':
def part(self, n_part: int) -> UnquantizedTensor:
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
def permute(self, n_head: int, n_head_kv: int) -> 'UnquantizedTensor':
def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
tensor = lazy_tensor.load()
assert isinstance(tensor, UnquantizedTensor)
@@ -496,13 +509,13 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv
return tensor.ndarray
GGMLCompatibleTensor = Union[UnquantizedTensor]
GGMLCompatibleTensor = UnquantizedTensor
@dataclass
class LazyTensor:
_load: Callable[[], Tensor]
shape: List[int]
shape: list[int]
data_type: DataType
description: str
@@ -513,7 +526,7 @@ class LazyTensor:
(self.data_type, ret.data_type, self.description)
return ret
def astype(self, data_type: DataType) -> 'LazyTensor':
def astype(self, data_type: DataType) -> LazyTensor:
self.validate_conversion_to(data_type)
def load() -> Tensor:
@@ -525,24 +538,24 @@ class LazyTensor:
raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
LazyModel = Dict[str, LazyTensor]
LazyModel: TypeAlias = 'dict[str, LazyTensor]'
@dataclass
class ModelPlus:
model: LazyModel
paths: List[Path] # Where this was read from.
paths: list[Path] # Where this was read from.
format: Literal['ggml', 'torch', 'safetensors', 'none']
vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab.
vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
def merge_sharded(models: List[LazyModel]) -> LazyModel:
def merge_sharded(models: list[LazyModel]) -> LazyModel:
# Original LLaMA models have each file contain one part of each tensor.
# Use a dict instead of a set to preserve order.
names = {name: None for model in models for name in model}
def convert(name: str) -> LazyTensor:
lazy_tensors: List[LazyTensor] = [model[name] for model in models]
lazy_tensors: list[LazyTensor] = [model[name] for model in models]
if len(lazy_tensors) == 1:
# only one file; don't go through this procedure since there might
# be quantized tensors
@@ -570,7 +583,7 @@ def merge_sharded(models: List[LazyModel]) -> LazyModel:
return {name: convert(name) for name in names}
def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
formats = set(mp.format for mp in models_plus)
assert len(formats) == 1, "different formats?"
format = formats.pop()
@@ -644,7 +657,7 @@ class LazyUnpickler(pickle.Unpickler):
assert isinstance(pid[1], LazyStorageKind)
data_type = pid[1].data_type
filename_stem = pid[2]
filename = self.data_base_path + '/' + filename_stem
filename = f'{self.data_base_path}/{filename_stem}'
info = self.zip_file.getinfo(filename)
def load(offset: int, elm_count: int) -> NDArray:
@ -660,7 +673,6 @@ class LazyUnpickler(pickle.Unpickler):
@staticmethod
def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
# pyright: ignore[reportSelfClsParameterName]
requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
assert isinstance(storage, LazyStorage)
@ -674,7 +686,7 @@ class LazyUnpickler(pickle.Unpickler):
def rebuild_from_type_v2(func, new_type, args, state):
return func(*args)
CLASSES: Dict[Tuple[str, str], Any] = {
CLASSES: dict[tuple[str, str], Any] = {
# getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute.
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@ -707,15 +719,15 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
header_size, = struct.unpack('<Q', fp.read(8))
header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
# Use mmap for the actual data to avoid race conditions with the file offset.
mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
byte_buf = mapped[8 + header_size:]
def convert(info: Dict[str, Any]) -> LazyTensor:
def convert(info: dict[str, Any]) -> LazyTensor:
data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
numpy_dtype = data_type.dtype
shape: List[int] = info['shape']
shape: list[int] = info['shape']
begin, end = info['data_offsets']
assert 0 <= begin <= end <= len(byte_buf)
assert end - begin == math.prod(shape) * numpy_dtype.itemsize
@ -754,7 +766,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
In = TypeVar('In')
Out = TypeVar('Out')
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, use_processpool_executor: bool = False) -> Iterable[Out]:
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
'''Parallel map, but with backpressure. If the caller doesn't call `next`
fast enough, this will stop calling `func` at some point rather than
letting results pile up in memory. Specifically, there is a max of one
@ -763,13 +775,13 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
yield from map(func, iterable)
# Not reached.
iterable = iter(iterable)
executor_class: Union[Type[ThreadPoolExecutor], Type[ProcessPoolExecutor]]
executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
if use_processpool_executor:
executor_class = ProcessPoolExecutor
else:
executor_class = ThreadPoolExecutor
with executor_class(max_workers = max_workers) as executor:
futures: List[concurrent.futures.Future[Out]] = []
futures: list[concurrent.futures.Future[Out]] = []
done = False
for _ in range(concurrency):
try:
@ -813,9 +825,9 @@ class OutputFile:
name = "LLaMA"
# TODO: better logic to determine model name
if (params.n_ctx == 4096):
if params.n_ctx == 4096:
name = "LLaMA v2"
elif params.path_model:
elif params.path_model is not None:
name = str(params.path_model.parent).split('/')[-1]
self.gguf.add_name (name)
@ -828,13 +840,13 @@ class OutputFile:
self.gguf.add_head_count_kv (params.n_head_kv)
self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
if params.f_rope_freq_base:
if params.f_rope_freq_base is not None:
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
if params.f_rope_scale:
if params.f_rope_scale is not None:
self.gguf.add_rope_scale_linear(params.f_rope_scale)
if params.ftype:
if params.ftype is not None:
self.gguf.add_file_type(params.ftype)
def add_meta_vocab(self, vocab: Vocab) -> None:
@ -893,13 +905,13 @@ class OutputFile:
of.close()
@staticmethod
def do_item(item: Tuple[str, LazyTensor]) -> Tuple[DataType, NDArray]:
def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
name, lazy_tensor = item
tensor = lazy_tensor.load().to_ggml()
return (lazy_tensor.data_type, tensor.ndarray)
@staticmethod
def maybe_do_quantize(item: Tuple[DataType, NDArray]) -> NDArray:
def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
dt, arr = item
if not isinstance(dt, QuantizedDataType):
return arr
@ -940,7 +952,7 @@ class OutputFile:
of.close()
def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
@ -960,7 +972,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
should_skip: Set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
tmp = model
@ -995,12 +1007,12 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
return out
def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
def nth_multifile_path(path: Path, n: int) -> Path | None:
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
the nth path in the model.
'''
# Support the following patterns:
patterns: List[Tuple[str, str]] = [
patterns: list[tuple[str, str]] = [
# - x.00.pth, x.01.pth, etc.
(r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@ -1016,11 +1028,11 @@ def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
return None
def find_multifile_paths(path: Path) -> List[Path]:
def find_multifile_paths(path: Path) -> list[Path]:
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
the whole list of paths in the model.
'''
ret: List[Path] = []
ret: list[Path] = []
for i in itertools.count():
nth_path = nth_multifile_path(path, i)
if nth_path is None:
@ -1051,7 +1063,7 @@ def load_some_model(path: Path) -> ModelPlus:
path = files[0]
paths = find_multifile_paths(path)
models_plus: List[ModelPlus] = []
models_plus: list[ModelPlus] = []
for path in paths:
print(f"Loading model file {path}")
models_plus.append(lazy_load_file(path))
@ -1060,7 +1072,7 @@ def load_some_model(path: Path) -> ModelPlus:
return model_plus
def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
# Be extra-friendly and accept either a file or a directory. Also, if it's
# a directory, it might be the model directory, and tokenizer.model might
# be in the parent of that.
@ -1091,7 +1103,7 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, Sentence
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
namestr = {
GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16",
@ -1114,7 +1126,7 @@ def do_dump_model(model_plus: ModelPlus) -> None:
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
def main(args_in: Optional[List[str]] = None) -> None:
def main(args_in: list[str] | None = None) -> None:
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")

View file

@ -23,6 +23,7 @@ else()
add_subdirectory(train-text-from-scratch)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(embd-input)
add_subdirectory(llama-bench)
add_subdirectory(beam-search)

View file

@ -1617,15 +1617,10 @@ int main(int argc, char ** argv) {
float error_before_opt = ggml_get_f32_1d(e, 0);
struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
opt_params_adam.print_forward_graph = false;
opt_params_adam.print_backward_graph = false;
opt_params_lbfgs.print_forward_graph = false;
opt_params_lbfgs.print_backward_graph = false;
opt_params_adam.adam.n_iter = 16;
opt_params_lbfgs.lbfgs.n_iter = 16;
// ggml_opt(ctx0, opt_params_adam, e);
ggml_opt(ctx0, opt_params_lbfgs, e);
//
ggml_build_forward_expand(&gf, e);

View file

@ -1,7 +1,3 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "llama.h"
#include "build-info.h"
@ -22,7 +18,9 @@
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
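A note on the hunk above: `NOMINMAX` is often already passed on the compiler command line (e.g. `-DNOMINMAX`), so an unconditional `#define` would emit a macro-redefinition warning; wrapping it in `#ifndef` makes the define idempotent. The macro itself stops `<windows.h>` from defining `min`/`max` macros that break `std::min`/`std::max`. A minimal standalone sketch of the guard's effect (not the full example source):

```cpp
// NOMINMAX may already be set by the build system; the #ifndef keeps the
// define idempotent instead of redefining it.
#ifndef NOMINMAX
# define NOMINMAX // stop <windows.h> from defining min()/max() macros
#endif
#if defined(_WIN32)
# include <windows.h>
#endif
#include <algorithm>

int smaller(int a, int b) {
    return std::min(a, b); // would fail to compile if min were a macro
}
```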
@ -73,7 +71,7 @@ void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_stat
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%lu", n);
printf("%zu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
@ -145,7 +143,7 @@ int main(int argc, char ** argv)
if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}
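The `%lu` → `%zu` changes above matter for portability: `size_t` is not `unsigned long` on every ABI (notably 64-bit Windows, where `long` is 32-bit), while `%zu` is the standard C99/C++11 conversion for `size_t`. A minimal sketch of the correct usage:

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> tokens(1234);
    // %zu is the portable conversion for size_t; %lu only happens to work on
    // ABIs where size_t == unsigned long, which 64-bit Windows is not.
    std::printf("%zu tokens\n", tokens.size());
    return 0;
}
```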

View file

@ -1,5 +1,6 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include <unordered_map>
#include <vector>
@ -75,7 +76,7 @@ typedef struct {
int seq_len; // max sequence length
} Config;
typedef struct {
struct TransformerWeights {
// token embedding table
float* token_embedding_table; // (vocab_size, dim)
// weights for rmsnorms
@ -97,7 +98,22 @@ typedef struct {
// float* freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer
float* wcls;
} TransformerWeights;
~TransformerWeights() {
delete[] token_embedding_table;
delete[] rms_att_weight;
delete[] rms_ffn_weight;
delete[] wq;
delete[] wk;
delete[] wv;
delete[] wo;
delete[] w1;
delete[] w2;
delete[] w3;
delete[] rms_final_weight;
delete[] wcls;
}
};
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy
@ -173,21 +189,6 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
return 0;
}
void free_weights(TransformerWeights* w) {
delete w->token_embedding_table;
delete w->rms_att_weight;
delete w->rms_ffn_weight;
delete w->wq;
delete w->wk;
delete w->wv;
delete w->wo;
delete w->w1;
delete w->w2;
delete w->w3;
delete w->rms_final_weight;
if (w->wcls) delete w->wcls;
}
void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]);
@ -499,10 +500,10 @@ struct llama_file {
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
die_fmt("fread failed: %s", strerror(errno));
}
if (ret != 1) {
throw std::runtime_error(std::string("unexpectedly reached end of file"));
die("unexpectedly reached end of file");
}
}
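The hunk above swaps `throw std::runtime_error(...)` for `die()`/`die_fmt()`. Their definitions live outside this excerpt; a plausible sketch, assuming they simply report the error and exit — a reasonable fit for a one-shot converter where stack unwinding buys nothing:

```cpp
#include <cstdarg>
#include <cstdio>
#include <cstdlib>

// Hypothetical sketch of the helpers called above (their real definitions
// are not part of this excerpt): print the error and terminate.
[[noreturn]] static void die(const char * msg) {
    std::fprintf(stderr, "error: %s\n", msg);
    std::exit(1);
}

[[noreturn]] static void die_fmt(const char * fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    std::fprintf(stderr, "error: ");
    std::vfprintf(stderr, fmt, ap);
    std::fputc('\n', stderr);
    va_end(ap);
    std::exit(1);
}
```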
@ -596,6 +597,9 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
// assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
}
const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab);
@ -633,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
}
}
void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct;
switch (gg_weights->n_dims){
case 1:
@ -670,13 +674,13 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar
}
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
// stuff AK weights into GG weights one by one.
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
//print_row(model->norm, 0);
// for rms-att-weight
@ -686,18 +690,18 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i];
// 1d
stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
}
struct gguf_context * ctx = gguf_init_empty();
@ -898,7 +902,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
}
std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/");
size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) {
return path;
}
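The one-character fix above works because `find_last_of` matches any character in its argument set, so `"/\\"` finds the last POSIX or Windows separator. Sketch of the fixed behaviour with example results:

```cpp
#include <string>

// find_last_of searches for any character in the set, so both separator
// styles are handled.
std::string basename(const std::string & path) {
    const auto pos = path.find_last_of("/\\");
    return pos == std::string::npos ? path : path.substr(pos + 1);
}
// basename("models/7B/model.bin")   == "model.bin"
// basename("models\\7B\\model.bin") == "model.bin" (previously: whole path)
```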
@ -911,7 +915,7 @@ int main(int argc, char ** argv) {
return 1;
}
Config config;
TransformerWeights weights;
TransformerWeights weights = {};
{
FILE *file = fopen(params.fn_llama2c_model, "rb");
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
@ -953,6 +957,5 @@ int main(int argc, char ** argv) {
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
free_weights(&weights);
return 0;
}

View file

@ -1,8 +1,3 @@
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "embd-input.h"
#include <cassert>
@ -23,7 +18,7 @@ extern "C" {
struct MyModel* create_mymodel(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
if (!gpt_params_parse(argc, argv, params)) {
return nullptr;
}

View file

@ -11,17 +11,12 @@
int main(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
params.embedding = true;
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) {
@ -47,6 +42,12 @@ int main(int argc, char ** argv) {
return 1;
}
const int n_ctx_train = llama_n_ctx_train(ctx);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
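This hunk (and the matching one in `main` later in this diff) replaces the hard-coded "greater than 2048" warning with a check against the model's own training context via `llama_n_ctx_train()`, the accessor these diffs call. A sketch of the pattern, assuming only that accessor:

```cpp
#include <cstdio>
#include "llama.h"

// Compare the requested context size with the context length the model was
// trained on, rather than a hard-coded 2048, and only warn (the run proceeds).
static void warn_if_ctx_exceeds_training(llama_context * ctx, int n_ctx_requested) {
    const int n_ctx_train = llama_n_ctx_train(ctx);
    if (n_ctx_requested > n_ctx_train) {
        std::fprintf(stderr, "warning: model was trained on only %d context tokens (%d specified)\n",
                     n_ctx_train, n_ctx_requested);
    }
}
```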

View file

@ -76,7 +76,7 @@ bool gguf_ex_write(const std::string & fname) {
gguf_write_to_file(ctx, fname.c_str(), false);
fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
printf("%s: wrote file '%s;\n", __func__, fname.c_str());
ggml_free(ctx_data);
gguf_free(ctx);
@ -93,20 +93,20 @@ bool gguf_ex_read_0(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
@ -116,10 +116,10 @@ bool gguf_ex_read_0(const std::string & fname) {
const int keyidx = gguf_find_key(ctx, findkey);
if (keyidx == -1) {
fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
printf("%s: find key: %s not found.\n", __func__, findkey);
} else {
const char * key_value = gguf_get_val_str(ctx, keyidx);
fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
}
}
@ -127,13 +127,13 @@ bool gguf_ex_read_0(const std::string & fname) {
{
const int n_tensors = gguf_get_n_tensors(ctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
@ -153,20 +153,20 @@ bool gguf_ex_read_1(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
@ -174,13 +174,13 @@ bool gguf_ex_read_1(const std::string & fname) {
{
const int n_tensors = gguf_get_n_tensors(ctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
@ -189,13 +189,13 @@ bool gguf_ex_read_1(const std::string & fname) {
const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) {
fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
printf("%s: reading tensor %d data\n", __func__, i);
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
// print first 10 elements
const float * data = (const float *) cur->data;
@ -219,7 +219,7 @@ bool gguf_ex_read_1(const std::string & fname) {
}
}
fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
ggml_free(ctx_data);
gguf_free(ctx);
@ -229,7 +229,7 @@ bool gguf_ex_read_1(const std::string & fname) {
int main(int argc, char ** argv) {
if (argc < 3) {
fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
printf("usage: %s data.gguf r|w\n", argv[0]);
return -1;
}

View file

@ -305,9 +305,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
if( cur == NULL ) {
fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
} else {
// fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
}
return cur;
@ -333,21 +333,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
return false;
}
fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
// print all kv
#if 0
{
const int n_kv = gguf_get_n_kv(ggufctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ggufctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
#endif
@ -357,21 +357,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
int keyidx;
keyidx = gguf_find_key(ggufctx, "general.name");
if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.description");
if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.author");
if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.license");
if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.file_type");
if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
}
// check required metadata
@ -382,11 +382,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) {
fprintf(stdout, "%s: model architecture not supported!\n", __func__);
printf("%s: model architecture not supported!\n", __func__);
return false;
}
} else {
fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
printf("%s: gguf model architecture not found!\n", __func__);
return false;
}
@ -394,11 +394,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout");
if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) {
fprintf(stdout, "%s: model tensor data layout not supported!\n", __func__);
printf("%s: model tensor data layout not supported!\n", __func__);
return false;
}
} else {
fprintf(stdout, "%s: gguf model tensor data layout not found!\n", __func__);
printf("%s: gguf model tensor data layout not found!\n", __func__);
return false;
}
@ -455,11 +455,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
printf("%s: tokenizer model not supported!\n", __func__);
return false;
}
} else {
fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
printf("%s: tokenizer model not found!\n", __func__);
return false;
}
@ -467,22 +467,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
if (tokens_keyidx == -1) {
fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
return false;
}
int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
if (merges_keyidx == -1) {
fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
printf("%s: gpt2 tokenizer merges not found!\n", __func__);
return false;
}
hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);
fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
for (size_t i = 0; i < hparams.n_vocab; i++) {
std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@ -523,12 +523,12 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
}
@ -543,13 +543,13 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
{
const int n_tensors = gguf_get_n_tensors(ggufctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ggufctx, i);
const size_t offset = gguf_get_tensor_offset(ggufctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
#endif
@ -953,7 +953,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}

View file

@ -318,9 +318,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
if( cur == NULL ) {
fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
} else {
// fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
}
return cur;
@ -346,21 +346,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
return false;
}
fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
// print all kv
#if 0
{
const int n_kv = gguf_get_n_kv(ggufctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ggufctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
#endif
@ -370,21 +370,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
int keyidx;
keyidx = gguf_find_key(ggufctx, "general.name");
if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.description");
if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.author");
if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.license");
if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.file_type");
if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
}
// check required metadata
@ -395,11 +395,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) {
fprintf(stdout, "%s: model architecture not supported!\n", __func__);
printf("%s: model architecture not supported!\n", __func__);
return false;
}
} else {
fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
printf("%s: gguf model architecture not found!\n", __func__);
return false;
}
@ -456,11 +456,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
printf("%s: tokenizer model not supported!\n", __func__);
return false;
}
} else {
fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
printf("%s: tokenizer model not found!\n", __func__);
return false;
}
@ -468,22 +468,22 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
if (tokens_keyidx == -1) {
fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
return false;
}
int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
if (merges_keyidx == -1) {
fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
printf("%s: gpt2 tokenizer merges not found!\n", __func__);
return false;
}
hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);
fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
for (size_t i = 0; i < hparams.n_vocab; i++) {
std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@ -524,12 +524,12 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
}
@ -543,13 +543,13 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
{
const int n_tensors = gguf_get_n_tensors(ggufctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ggufctx, i);
const size_t offset = gguf_get_tensor_offset(ggufctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
#endif
@ -660,9 +660,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
ggml_tensor * gpt_neox_ff(
const gpt_neox_block &block,
ggml_context * ctx0,
ggml_tensor * inp) {
ggml_tensor * inp,
const gpt_neox_hparams &hparams) {
ggml_tensor * cur = ggml_norm(ctx0, inp);
ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
@ -753,7 +754,7 @@ bool gpt_neox_eval(
// self-attention
{
{
cur = ggml_norm(ctx0, inpL);
cur = ggml_norm(ctx0, inpL, hparams.norm_eps);
cur = ggml_add(ctx0,
ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
@ -844,7 +845,7 @@ bool gpt_neox_eval(
if (hparams.par_res == 0) {
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams);
// input for next layer
inpL = ggml_add(ctx0, cur, inpFF);
@ -853,7 +854,7 @@ bool gpt_neox_eval(
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
// note here we pass inpL instead of cur
cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams);
// layer input + FF
cur = ggml_add(ctx0, cur, inpFF);
@ -867,7 +868,7 @@ bool gpt_neox_eval(
// norm
{
inpL = ggml_norm(ctx0, inpL);
inpL = ggml_norm(ctx0, inpL, hparams.norm_eps);
// inpL = ln_f_g*inpL + ln_f_b
inpL = ggml_add(ctx0,
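The `ggml_norm` calls in these hunks now take an explicit epsilon, threaded through from `hparams.norm_eps`, because the stabilizer inside layer norm is model-specific rather than a universal constant. A plain-C++ sketch of the math being parameterized — not ggml's implementation, just the role of `eps`:

```cpp
#include <cmath>
#include <cstddef>

// eps stabilizes the 1/sqrt(variance + eps) term when variance is near zero;
// its value (e.g. 1e-5) comes from the model's hyperparameters.
static void layer_norm(float * x, std::size_t n, float eps) {
    float mean = 0.0f;
    for (std::size_t i = 0; i < n; i++) mean += x[i];
    mean /= n;

    float var = 0.0f;
    for (std::size_t i = 0; i < n; i++) var += (x[i] - mean) * (x[i] - mean);
    var /= n;

    const float inv = 1.0f / std::sqrt(var + eps);
    for (std::size_t i = 0; i < n; i++) x[i] = (x[i] - mean) * inv;
}
```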
@ -924,7 +925,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}

examples/llama-bench/llama-bench.cpp Executable file → Normal file
View file

@ -165,26 +165,26 @@ static const cmd_params cmd_params_defaults = {
};
static void print_usage(int /* argc */, char ** argv) {
fprintf(stdout, "usage: %s [options]\n", argv[0]);
fprintf(stdout, "\n");
fprintf(stdout, "options:\n");
fprintf(stdout, " -h, --help\n");
fprintf(stdout, " -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
fprintf(stdout, " -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
fprintf(stdout, " -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
fprintf(stdout, " -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
fprintf(stdout, " --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
fprintf(stdout, " -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
fprintf(stdout, " -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
fprintf(stdout, " -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
fprintf(stdout, " -ts, --tensor_split <ts0/ts1/..> \n");
fprintf(stdout, " -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
fprintf(stdout, "\n");
fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
printf("usage: %s [options]\n", argv[0]);
printf("\n");
printf("options:\n");
printf(" -h, --help\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> \n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}
@ -986,7 +986,12 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx);
// warmup run
test_gen(ctx, 1, 0, t.n_threads);
if (t.n_prompt > 0) {
test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
test_gen(ctx, 1, 0, t.n_threads);
}
for (int i = 0; i < params.reps; i++) {
uint64_t t_start = get_time_ns();

View file

@ -34,7 +34,7 @@ For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.):
```bash
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
@ -45,7 +45,7 @@ User:'
#### Windows:
```powershell
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
```
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):

View file

@ -1,8 +1,3 @@
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "console.h"
@ -48,8 +43,9 @@ static bool is_interacting = false;
void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> input_tokens, const std::string output, const std::vector<llama_token> output_tokens) {
const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens
) {
if (params.logdir.empty()) {
return;
}
@ -109,14 +105,14 @@ int main(int argc, char ** argv) {
gpt_params params;
g_params = &params;
if (gpt_params_parse(argc, argv, params) == false) {
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc,argv);
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// TODO: Dump params ?
@ -151,14 +147,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
}
if (params.n_ctx > 2048) {
// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) {
@ -194,6 +182,15 @@ int main(int argc, char ** argv) {
return 1;
}
const int n_ctx_train = llama_n_ctx_train(ctx);
if (params.n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
} else if (params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
// print system information
{
LOG_TEE("\n");
@ -304,7 +301,7 @@ int main(int argc, char ** argv) {
// debug message about similarity of saved session, if applicable
size_t n_matching_session_tokens = 0;
if (session_tokens.size() > 0) {
if (!session_tokens.empty()) {
for (llama_token id : session_tokens) {
if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
break;
@ -402,7 +399,7 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: interactive mode on.\n", __func__);
if (params.antiprompt.size()) {
if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) {
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
}
@ -425,8 +422,9 @@ int main(int argc, char ** argv) {
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
struct llama_grammar * grammar = NULL;
grammar_parser::parse_state parsed_grammar;
llama_grammar * grammar = NULL;
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
@ -450,8 +448,8 @@ int main(int argc, char ** argv) {
}
// TODO: replace with ring-buffer
std::vector<llama_token> last_n_tokens(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_tokens.begin(), last_tokens.end(), 0);
if (params.interactive) {
const char *control_message;
@ -492,17 +490,14 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
{
LOG("warming up the model with an empty run\n");
const int n_vocab = llama_n_vocab(ctx);
const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
llama_reset_timings(ctx);
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (embd.size() > 0) {
if (!embd.empty()) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
@ -537,8 +532,8 @@ int main(int argc, char ** argv) {
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
// insert n_left/2 tokens at the start of embd from last_tokens
embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
@ -627,7 +622,7 @@ int main(int argc, char ** argv) {
LOG("n_past = %d\n", n_past);
}
if (embd.size() > 0 && !path_session.empty()) {
if (!embd.empty() && !path_session.empty()) {
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
n_session_consumed = session_tokens.size();
}
@ -637,20 +632,6 @@ int main(int argc, char ** argv) {
embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
const float repeat_penalty = params.repeat_penalty;
const float alpha_presence = params.presence_penalty;
const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
// optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false;
@ -659,98 +640,12 @@ int main(int argc, char ** argv) {
LOG("saved session to %s\n", path_session.c_str());
}
llama_token id = 0;
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
{
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
// Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
}
// Apply penalties
float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &cur_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
if (grammar != NULL) {
llama_sample_grammar(ctx, &cur_p, grammar);
}
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &cur_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k (ctx, &cur_p, top_k, 1);
llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
llama_sample_typical (ctx, &cur_p, typical_p, 1);
llama_sample_top_p (ctx, &cur_p, top_p, 1);
llama_sample_temperature(ctx, &cur_p, temp);
{
const int n_top = 10;
LOG("top %d candidates:\n", n_top);
for (int i = 0; i < n_top; i++) {
const llama_token id = cur_p.data[i].id;
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
}
}
id = llama_sample_token(ctx, &cur_p);
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
}
}
// printf("`%d`", candidates_p.size);
if (grammar != NULL) {
llama_grammar_accept_token(ctx, grammar, id);
}
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_n_tokens));
}
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
embd.push_back(id);
@ -766,8 +661,8 @@ int main(int argc, char ** argv) {
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[n_consumed]);
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(embd_inp[n_consumed]);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
@ -798,9 +693,9 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// check for reverse prompt
if (params.antiprompt.size()) {
if (!params.antiprompt.empty()) {
std::string last_output;
for (auto id : last_n_tokens) {
for (auto id : last_tokens) {
last_output += llama_token_to_piece(ctx, id);
}
@ -831,11 +726,11 @@ int main(int argc, char ** argv) {
}
// deal with end of text token in interactive mode
if (last_n_tokens.back() == llama_token_eos(ctx)) {
if (last_tokens.back() == llama_token_eos(ctx)) {
LOG("found EOS token\n");
if (params.interactive) {
if (params.antiprompt.size() != 0) {
if (!params.antiprompt.empty()) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
@ -933,7 +828,7 @@ int main(int argc, char ** argv) {
if (grammar != NULL) {
llama_grammar_free(grammar);
std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root"));

View file

@ -368,7 +368,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
const int first = std::min(512, params.n_ctx/2);
const int first = params.n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += params.n_ctx - first - 1;
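For context, the quantity accumulated here is the standard perplexity, the exponentiated mean negative log-likelihood over the scored tokens (a textbook formulation, not code from this file):
PPL = exp( -(1/N) * sum_{i=1..N} log p(x_i | x_{<i}) )
With first = n_ctx/2, only the last half of each chunk contributes to N, so every scored token has at least half a window of preceding context.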
@ -655,7 +655,7 @@ int main(int argc, char ** argv) {
gpt_params params;
params.n_batch = 512;
if (gpt_params_parse(argc, argv, params) == false) {
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
@ -668,11 +668,6 @@ int main(int argc, char ** argv) {
params.n_ctx += params.ppl_stride/2;
}
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) {
@ -698,6 +693,12 @@ int main(int argc, char ** argv) {
return 1;
}
const int n_ctx_train = llama_n_ctx_train(ctx);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
fprintf(stderr, "\n");

View file

@ -71,7 +71,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
}
// Check if a layer is included/excluded by command line
bool layer_included(const quantize_stats_params params, const std::string & layer) {
bool layer_included(const quantize_stats_params & params, const std::string & layer) {
for (const auto& excluded : params.exclude_layers) {
if (std::regex_search(layer, std::regex(excluded))) {
return false;

View file

@ -35,6 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
};
@ -71,12 +73,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
void usage(const char * executable) {
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
fprintf(stderr, "\nAllowed quantization types:\n");
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
if (it.name != "COPY") {
printf(" %2d or ", it.ftype);
} else {
printf(" ");
}
printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
}
exit(1);
}
@ -121,6 +128,9 @@ int main(int argc, char ** argv) {
// export as [inp path]/ggml-model-[ftype].gguf
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
arg_idx++;
if (ftype_str == "COPY") {
params.only_copy = true;
}
}
else {
fname_out = argv[arg_idx];
@ -134,6 +144,9 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
return 1;
}
if (ftype_str == "COPY") {
params.only_copy = true;
}
arg_idx++;
}
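Usage note for the COPY pseudo-type added above: it re-serializes the tensors unmodified (only_copy = true), which is useful for rewriting a GGUF file without quantizing. With hypothetical paths:
./quantize models/llama/ggml-model-f32.gguf models/llama/ggml-model-copy.gguf COPY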

View file

@ -13,7 +13,7 @@ int main(int argc, char ** argv) {
params.repeat_last_n = 64;
params.prompt = "The quick brown fox";
if (gpt_params_parse(argc, argv, params) == false) {
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
@ -44,7 +44,7 @@ int main(int argc, char ** argv) {
llama_free_model(model);
return 1;
}
auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
auto tokens = llama_tokenize(ctx, params.prompt, true);
auto n_prompt_tokens = tokens.size();
if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);

File diff suppressed because it is too large

View file

@ -145,7 +145,29 @@
color: #888;
}
@keyframes loading-bg-wipe {
0% {
background-position: 0%;
}
100% {
background-position: 100%;
}
}
.loading {
--loading-color-1: #eeeeee00;
--loading-color-2: #eeeeeeff;
background-size: 50% 100%;
background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
animation: loading-bg-wipe 2s linear infinite;
}
@media (prefers-color-scheme: dark) {
.loading {
--loading-color-1: #22222200;
--loading-color-2: #222222ff;
}
.popover-content {
background-color: black;
}
@ -321,7 +343,10 @@
const llamaStats = signal(null)
const controller = signal(null)
const generating = computed(() => controller.value == null )
// currently generating a completion?
const generating = computed(() => controller.value != null)
// has the user started a chat?
const chatStarted = computed(() => session.value.transcript.length > 0)
const transcriptUpdate = (transcript) => {
@ -430,11 +455,19 @@
return html`
<form onsubmit=${submit}>
<div>
<textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/>
<textarea
className=${generating.value ? "loading" : null}
oninput=${(e) => message.value = e.target.value}
onkeypress=${enterSubmits}
placeholder="Say something..."
rows=2
type="text"
value="${message}"
/>
</div>
<div class="right">
<button type="submit" disabled=${!generating.value} >Send</button>
<button onclick=${stop} disabled=${generating}>Stop</button>
<button type="submit" disabled=${generating.value}>Send</button>
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
<button onclick=${reset}>Reset</button>
</div>
</form>

View file

@ -17,6 +17,8 @@
#include "completion.js.hpp"
#include "json-schema-to-grammar.mjs.hpp"
#include <cstddef>
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
@ -116,7 +118,7 @@ static void server_log(const char *level, const char *function, int line,
}
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
fprintf(stdout, "%.*s\n", (int)str.size(), str.data());
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
@ -137,7 +139,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
}
// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> probs)
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> & probs)
{
json out = json::array();
for (const auto &prob : probs)
@ -269,7 +271,7 @@ struct llama_server_context
return true;
}
std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
{
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string.
@ -609,7 +611,7 @@ struct llama_server_context
completion_token_output doCompletion()
{
const completion_token_output token_with_probs = nextToken();
auto token_with_probs = nextToken();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
generated_text += token_text;
@ -692,50 +694,50 @@ struct llama_server_context
static void server_print_usage(const char *argv0, const gpt_params &params,
const server_params &sparams)
{
fprintf(stdout, "usage: %s [options]\n", argv0);
fprintf(stdout, "\n");
fprintf(stdout, "options:\n");
fprintf(stdout, " -h, --help show this help message and exit\n");
fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
printf("usage: %s [options]\n", argv0);
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
if (llama_mlock_supported())
{
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
if (llama_mmap_supported())
{
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
printf(" --numa attempt optimizations that help on some NUMA systems\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
fprintf(stdout, " number of layers to store in VRAM\n");
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
printf(" -nommq, --no-mul-mat-q\n");
printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif
fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
fprintf(stdout, " -a ALIAS, --alias ALIAS\n");
fprintf(stdout, " set an alias for the model, will be added as `model` field in completion response\n");
fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stdout, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
fprintf(stdout, " --port PORT port to listen (default (default: %d)\n", sparams.port);
fprintf(stdout, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
fprintf(stdout, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
fprintf(stdout, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
fprintf(stdout, "\n");
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -a ALIAS, --alias ALIAS\n");
printf(" set an alias for the model, will be added as `model` field in completion response\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf("\n");
}
static void server_params_parse(int argc, char **argv, server_params &sparams,
@ -1038,7 +1040,7 @@ static json format_timings(llama_server_context &llama)
{
const auto timings = llama_get_timings(llama.ctx);
assert(timings.n_eval == llama.num_tokens_predicted);
assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
return json{
{"prompt_n", timings.n_p_eval},
@ -1239,7 +1241,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
const llama_token * tokens = beams_state.beam_views[0].tokens;
const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
printf("%lu", n);
printf("%zu", n);
}
fflush(stdout);
#if 0 // DEBUG: print current beams for this iteration
@ -1253,7 +1255,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
struct token_translator {
llama_context * ctx;
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
};
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
@ -1377,7 +1379,13 @@ int main(int argc, char **argv)
}
}
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
auto probs = llama.generated_token_probs;
if (llama.params.n_probs > 0 && llama.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
}
const json data = format_final_response(llama, llama.generated_text, probs);
llama_print_timings(llama.ctx);
@ -1454,7 +1462,11 @@ int main(int argc, char **argv)
if (!llama.has_next_token) {
// Generation is done, send extra information.
const json data = format_final_response(llama, "", llama.generated_token_probs);
const json data = format_final_response(
llama,
"",
std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
);
const std::string str =
"data: " +
@ -1548,7 +1560,7 @@ int main(int argc, char **argv)
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep)
{
const auto * fmt = "500 Internal Server Error\n%s";
const char fmt[] = "500 Internal Server Error\n%s";
char buf[BUFSIZ];
try {
std::rethrow_exception(std::move(ep));
@ -1583,7 +1595,7 @@ int main(int argc, char **argv)
svr.set_base_dir(sparams.public_path);
// to make it ctrl+clickable:
fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
LOG_INFO("HTTP server listening", {
{"hostname", sparams.hostname},

View file

@ -1,7 +1,3 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "build-info.h"
#include "common.h"

View file

@ -0,0 +1,8 @@
set(TARGET speculative)
add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -0,0 +1,288 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"
#include "grammar-parser.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.model_draft.empty()) {
fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init(params.numa);
llama_model * model_tgt = NULL;
llama_model * model_dft = NULL;
llama_context * ctx_tgt = NULL;
llama_context * ctx_dft = NULL;
// load the target model
params.perplexity = true; // HACK: enable logits_all = true
std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
// load the draft model
params.model = params.model_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
const int max_context_size = llama_n_ctx(ctx_tgt);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
}
fflush(stderr);
const int n_input = inp.size();
const auto t_enc_start = ggml_time_us();
// eval the prompt with both models
llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads);
llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads);
llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads);
const auto t_enc_end = ggml_time_us();
// the 2 models should have the same vocab
const int n_ctx = llama_n_ctx(ctx_tgt);
const int n_vocab = llama_n_vocab(ctx_tgt);
//GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
// how many tokens to draft each time
const int n_draft = params.n_draft;
int n_predict = 0;
int n_drafted = 0;
int n_accept = 0;
int n_past_tgt = inp.size();
int n_past_dft = inp.size();
std::vector<llama_token> drafted;
std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_tokens.begin(), last_tokens.end(), 0);
for (auto & id : inp) {
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
// used to determine end of generation
bool has_eos = false;
// grammar stuff
struct llama_grammar * grammar_dft = NULL;
struct llama_grammar * grammar_tgt = NULL;
grammar_parser::parse_state parsed_grammar;
// if requested - load the grammar, error checking is omitted for brevity
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
return 1;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}
const auto t_dec_start = ggml_time_us();
while (true) {
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
int i_dft = 0;
while (true) {
// sample from the target model
const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
// remember which tokens were sampled - used for repetition penalties during sampling
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
printf("%s", token_str.c_str());
fflush(stdout);
if (id == llama_token_eos(ctx_tgt)) {
has_eos = true;
}
++n_predict;
// check if the draft matches the target
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept;
++n_past_tgt;
++n_past_dft;
++i_dft;
continue;
}
// the drafted token was rejected or we are out of drafted tokens
if (i_dft < (int) drafted.size()) {
LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
} else {
LOG("out of drafted tokens\n");
}
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
++n_past_dft;
drafted.clear();
drafted.push_back(id);
break;
}
if (n_predict > params.n_predict || has_eos) {
break;
}
if (grammar_tgt) {
if (grammar_dft) {
llama_grammar_free(grammar_dft);
}
grammar_dft = llama_grammar_copy(grammar_tgt);
LOG("copied target grammar to draft grammar\n");
}
// sample n_draft tokens from the draft model using greedy decoding
int n_past_cur = n_past_dft;
for (int i = 0; i < n_draft; ++i) {
float * logits = llama_get_logits(ctx_dft);
candidates.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (grammar_dft != NULL) {
llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
}
// computes softmax and sorts the candidates
llama_sample_softmax(ctx_dft, &cur_p);
for (int i = 0; i < 3; ++i) {
LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
}
// TODO: better logic?
if (cur_p.data[0].p < 2*cur_p.data[1].p) {
LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
break;
}
// drafted token
const llama_token id = cur_p.data[0].id;
drafted.push_back(id);
++n_drafted;
// no need to evaluate the last drafted token, since we won't use the result
if (i == n_draft - 1) {
break;
}
// evaluate the drafted token on the draft model
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
++n_past_cur;
if (grammar_dft != NULL) {
llama_grammar_accept_token(ctx_dft, grammar_dft, id);
}
}
// evaluate the target model on the drafted tokens
llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
++n_past_tgt;
// the first token is always proposed by the target model before the speculation loop
drafted.erase(drafted.begin());
}
auto t_dec_end = ggml_time_us();
LOG_TEE("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
// TODO: make sure these numbers are computed correctly
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("\ndraft:\n");
llama_print_timings(ctx_dft);
LOG_TEE("\ntarget:\n");
llama_print_timings(ctx_tgt);
llama_free(ctx_tgt);
llama_free_model(model_tgt);
llama_free(ctx_dft);
llama_free_model(model_dft);
if (grammar_dft != NULL) {
llama_grammar_free(grammar_dft);
llama_grammar_free(grammar_tgt);
}
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}
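A condensed, self-contained toy of the accept/reject control flow above, with plain integer streams standing in for the two models (all names hypothetical; the real loop drafts by greedy sampling on the draft model):
#include <algorithm>
#include <cstdio>
#include <vector>
int main() {
    const std::vector<int> target  = {1, 2, 3, 4, 9, 5, 6}; // what the target model would sample
    const std::vector<int> drafted = {1, 2, 3, 4, 5, 6, 7}; // what the draft model proposes
    size_t pos = 0, n_accept = 0, n_drafted = 0;
    while (pos < target.size()) {
        const size_t n = std::min<size_t>(3, drafted.size() - pos); // draft a small batch
        n_drafted += n;
        // the target verifies the whole batch in one pass; accept until the first mismatch
        size_t i = 0;
        for (; i < n && drafted[pos + i] == target[pos + i]; ++i) {
            ++n_accept;
        }
        pos += i + 1; // the target's own sampled token always advances the stream
    }
    printf("accepted %zu of %zu drafted tokens\n", n_accept, n_drafted);
    return 0;
}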

View file

@ -2,13 +2,16 @@
# train-text-from-scratch checkpoint --> gguf conversion
import argparse
import gguf
import os
import struct
import sys
import numpy as np
from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
import gguf
# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"

View file

@ -169,10 +169,6 @@ struct my_llama_hparams {
float rope_freq_base = 10000.0f;
float rope_freq_scale = 1.0f;
bool operator!=(const my_llama_hparams& other) const {
return memcmp(this, &other, sizeof(my_llama_hparams));
}
};
struct my_llama_layer {
@ -929,28 +925,6 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa
}
}
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token>& out) {
FILE * fp = std::fopen(filename, "rb");
if (fp == NULL) {
@ -983,10 +957,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
out.resize(size+1);
if (std::fread(buf.data(), size, 1, fp) != 1) {
throw std::runtime_error(std::string("unexpectedly reached end of file"));
die("unexpectedly reached end of file");
}
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
die_fmt("fread failed: %s", strerror(errno));
}
buf[size] = '\0';
@ -1047,11 +1021,11 @@ void shuffle_ints(int * begin, int * end) {
if (kid >= 0) { \
enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
if (ktype != (type)) { \
throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
} \
(dst) = func(ctx, kid); \
} else if (req) { \
throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
die_fmt("key not found in model: %s", skey.c_str()); \
} \
}
@ -1136,7 +1110,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
} else {
throw std::runtime_error("unknown optimizer type\n");
die("unknown optimizer type");
}
}
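die() and die_fmt(), which replace the exceptions above, are assumed to be simple print-and-exit helpers; a minimal sketch with assumed signatures, not the project's actual definitions:
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
static void die(const char * msg) {
    fprintf(stderr, "error: %s\n", msg);
    exit(1);
}
static void die_fmt(const char * fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    fprintf(stderr, "error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    va_end(ap);
    exit(1);
}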
@ -1315,20 +1289,20 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod
const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST));
if (token_idx == -1) {
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
die("cannot find tokenizer vocab in model file");
}
const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx);
const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES));
if (score_idx == -1) {
throw std::runtime_error("cannot find tokenizer scores in model file\n");
die("cannot find tokenizer scores in model file");
}
const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx);
const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE));
if (toktype_idx == -1) {
throw std::runtime_error("cannot find token type list in GGUF file\n");
die("cannot find token type list in GGUF file");
}
const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx);
@ -1356,7 +1330,7 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod
// read and copy bpe merges
const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES));
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
die("cannot find tokenizer merges in model file");
}
const int n_merges = gguf_get_arr_n(vctx, merges_keyidx);
@ -1988,7 +1962,7 @@ void opt_callback(void * vdata, float * sched) {
float min_sched = params->adam_min_alpha / params->adam_alpha;
*sched = min_sched + *sched * (1.0f - min_sched);
int impr_plot = std::isnan(opt->loss_after) ? 0 : -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
int impr_plot = std::isnan(opt->loss_after) ? 0 : -std::lround(1 + (opt->loss_before - opt->loss_after) * 10.0f);
printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0);
if (data->shuffle_countdown < n_batch) {

View file

@ -93,6 +93,10 @@
type = "app";
program = "${self.packages.${system}.default}/bin/quantize";
};
apps.train-text-from-scratch = {
type = "app";
program = "${self.packages.${system}.default}/bin/train-text-from-scratch";
};
apps.default = self.apps.${system}.llama;
devShells.default = pkgs.mkShell {
buildInputs = [ llama-python ];

View file

@ -6,6 +6,26 @@
#include <stdlib.h>
#include <string.h>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/types.h>
#include <sys/mman.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <memoryapi.h>
#endif
#define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@ -99,19 +119,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
}
#endif
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
return ggml_nbytes(tensor);
UNUSED(alloc);
}
// check if a tensor is allocated by this buffer
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
void * ptr = tensor->data;
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
}
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
#ifdef GGML_ALLOCATOR_DEBUG
GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
#endif
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
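aligned_offset() is presumably the usual round-up-to-alignment helper; a simplified sketch assuming a power-of-two alignment (the real function also takes a base pointer, which this sketch ignores):
static size_t align_up_sketch(size_t size, size_t alignment) {
    // round size up to the next multiple of alignment (alignment must be a power of two)
    return (size + alignment - 1) & ~(alignment - 1);
}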
@ -135,14 +160,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
max_avail = MAX(max_avail, block->size);
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
max_avail = MAX(max_avail, block->size);
} else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
return;
return;
}
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
@ -177,17 +202,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
void * ptr = tensor->data;
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
if (ggml_allocr_is_own(alloc, tensor) == false) {
// the tensor was not allocated in this buffer
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
// the easiest way to deal with this is just to ignore it
return;
}
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
@ -281,17 +306,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
return alloc;
}
// address and size of the buffer when measuring
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
// OS specific functions to allocate and free uncommitted virtual memory
static void * alloc_vmem(size_t size) {
#if defined(_WIN32)
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
#elif defined(_POSIX_MAPPED_FILES)
void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (ptr == MAP_FAILED) {
return NULL;
}
return ptr;
#else
// use a fixed address for other platforms
uintptr_t base_addr = (uintptr_t)-size - 0x100;
return (void *)base_addr;
#endif
}
static void free_vmem(void * base_addr, size_t size) {
#if defined(_WIN32)
VirtualFree(base_addr, 0, MEM_RELEASE);
UNUSED(size);
#elif defined(_POSIX_MAPPED_FILES)
munmap(base_addr, size);
#else
// nothing to do
UNUSED(base_addr);
UNUSED(size);
#endif
}
// allocate uncommitted virtual memory to measure the size of the graph
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
// 1TB for 64-bit, 1GB for 32-bit
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
do {
*base_addr = alloc_vmem(*size);
if (*base_addr != NULL) {
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
return;
}
// try again with half the size
*size /= 2;
} while (*size > 0);
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
}
static void free_measure_vmem(void * base_addr, size_t size) {
free_vmem(base_addr, size);
}
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
void * base_addr;
size_t size;
alloc_measure_vmem(&base_addr, &size);
*alloc = (struct ggml_allocr){
/*.data = */ MEASURE_BASE_ADDR,
/*.size = */ MEASURE_MAX_SIZE,
/*.data = */ base_addr,
/*.size = */ size,
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
@ -311,6 +387,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
}
void ggml_allocr_free(struct ggml_allocr * alloc) {
if (alloc->measure) {
free_measure_vmem(alloc->data, alloc->size);
}
free(alloc);
}
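The measure allocator above only needs distinct, ordered addresses, never real memory, so it reserves uncommitted address space and releases it in ggml_allocr_free. A minimal POSIX sketch of the same trick (assumes _POSIX_MAPPED_FILES, as in the code above):
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
int main(void) {
    const size_t size = 1ULL << 30; // reserve 1 GiB of address space; no physical pages are committed
    void * base = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
    if (base == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    // pointers in [base, base + size) may be compared and offset, but never dereferenced
    munmap(base, size);
    return 0;
}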
@ -380,8 +459,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
}
// if the node's data is external, then we cannot re-use it
if ((char *) parent->data < (char *) alloc->data ||
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
if (ggml_allocr_is_own(alloc, parent) == false) {
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
continue;
}
@ -415,7 +493,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
}
}
static size_t ggml_allocator_alloc_graph_tensors_n(
static size_t ggml_allocr_alloc_graph_tensors_n(
struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@ -493,11 +571,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
AT_PRINTF("\n");
}
// update parents
// update immediately if there is no parse_seq
// update only at barriers if there is parse_seq
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
int update_end = alloc->parse_seq_len ? ind : ind + 1;
for (int i = update_start; i < update_end; i++) {
@ -521,12 +598,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocator_free_tensor(alloc, view_src);
ggml_allocr_free_tensor(alloc, view_src);
}
}
else {
if (parent->data != node->data) {
ggml_allocator_free_tensor(alloc, parent);
ggml_allocr_free_tensor(alloc, parent);
}
}
}
@ -543,7 +620,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
for (int i = 0; outputs[g][i] != NULL; i++) {
struct ggml_tensor * output = outputs[g][i];
AT_PRINTF("output: %s\n", output->name);
ggml_allocator_free_tensor(alloc, output);
ggml_allocr_free_tensor(alloc, output);
}
}
}
@ -552,5 +629,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
}
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}

File diff suppressed because it is too large

View file

@ -63,7 +63,9 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(relu);
GGML_METAL_DECL_KERNEL(gelu);
GGML_METAL_DECL_KERNEL(soft_max);
GGML_METAL_DECL_KERNEL(soft_max_4);
GGML_METAL_DECL_KERNEL(diag_mask_inf);
GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
GGML_METAL_DECL_KERNEL(get_rows_f16);
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@ -76,6 +78,8 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@ -116,22 +120,47 @@ static NSString * const msl_library_source = @"see metal.metal";
struct ggml_metal_context * ggml_metal_init(int n_cb) {
metal_printf("%s: allocating\n", __func__);
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
id <MTLDevice> device;
NSString * s;
#if TARGET_OS_OSX
// Show all the Metal device instances in the system
NSArray * devices = MTLCopyAllDevices();
for (device in devices) {
s = [device name];
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
}
#endif
// Pick and show default Metal device
device = MTLCreateSystemDefaultDevice();
s = [device name];
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
// Configure context
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
ctx->device = device;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0;
ctx->concur_list_len = 0;
ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
#if 0
// compile from source string and show compile log
#ifdef GGML_SWIFT
// load the default.metallib file
{
NSError * error = nil;
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
NSURL * libURL = [NSURL fileURLWithPath:libPath];
// Load the metallib file into a Metal library
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) {
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
@ -192,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(relu);
GGML_METAL_ADD_KERNEL(gelu);
GGML_METAL_ADD_KERNEL(soft_max);
GGML_METAL_ADD_KERNEL(soft_max_4);
GGML_METAL_ADD_KERNEL(diag_mask_inf);
GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
GGML_METAL_ADD_KERNEL(get_rows_f16);
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@ -205,6 +236,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@ -231,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#undef GGML_METAL_ADD_KERNEL
}
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
#if TARGET_OS_OSX
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.maxTransferRate != 0) {
metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
} else {
metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
}
#endif
return ctx;
}
@ -257,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(relu);
GGML_METAL_DEL_KERNEL(gelu);
GGML_METAL_DEL_KERNEL(soft_max);
GGML_METAL_DEL_KERNEL(diag_mask_inf);
GGML_METAL_DEL_KERNEL(soft_max_4);
GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
GGML_METAL_DEL_KERNEL(get_rows_f16);
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@ -270,6 +306,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(norm);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@ -310,7 +348,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
void * ggml_metal_host_malloc(size_t n) {
void * data = NULL;
const int result = posix_memalign((void **) &data, getpagesize(), n);
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
if (result != 0) {
metal_printf("%s: error: posix_memalign failed\n", __func__);
return NULL;
@ -384,7 +422,7 @@ bool ggml_metal_add_buffer(
}
}
const size_t size_page = getpagesize();
const size_t size_page = sysconf(_SC_PAGESIZE);
size_t size_aligned = size;
if ((size_aligned % size_page) != 0) {
@ -437,6 +475,7 @@ bool ggml_metal_add_buffer(
}
}
#if TARGET_OS_OSX
metal_printf(", (%8.2f / %8.2f)",
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@ -446,6 +485,9 @@ bool ggml_metal_add_buffer(
} else {
metal_printf("\n");
}
#else
metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
#endif
}
return true;
@ -680,6 +722,12 @@ void ggml_metal_graph_compute(
} break;
case GGML_OP_ADD:
{
GGML_ASSERT(ggml_is_contiguous(src0));
// utilize float4
GGML_ASSERT(ne00 % 4 == 0);
const int64_t nb = ne00/4;
if (ggml_nelements(src1) == ne10) {
// src1 is a row
[encoder setComputePipelineState:ctx->pipeline_add_row];
@ -689,14 +737,20 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&nb length:sizeof(nb) atIndex:3];
const int64_t n = ggml_nelements(dst);
const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_MUL:
{
GGML_ASSERT(ggml_is_contiguous(src0));
// utilize float4
GGML_ASSERT(ne00 % 4 == 0);
const int64_t nb = ne00/4;
if (ggml_nelements(src1) == ne10) {
// src1 is a row
[encoder setComputePipelineState:ctx->pipeline_mul_row];
@ -706,9 +760,9 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&nb length:sizeof(nb) atIndex:3];
const int64_t n = ggml_nelements(dst);
const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
@ -721,7 +775,7 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
const int64_t n = ggml_nelements(dst);
const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
@ -733,7 +787,7 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
@ -753,7 +807,7 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
@ -767,13 +821,16 @@ void ggml_metal_graph_compute(
{
const int nth = 32;
[encoder setComputePipelineState:ctx->pipeline_soft_max];
if (ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_soft_max_4];
} else {
[encoder setComputePipelineState:ctx->pipeline_soft_max];
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
@ -781,14 +838,23 @@ void ggml_metal_graph_compute(
{
const int n_past = ((int32_t *)(dst->op_params))[0];
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
if (ne00%8 == 0) {
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
} else {
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&n_past length:sizeof(int) atIndex:4];
[encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
if (ne00%8 == 0) {
[encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
}
else {
[encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
}
} break;
case GGML_OP_MUL_MAT:
{
@ -835,14 +901,23 @@ void ggml_metal_graph_compute(
} else {
int nth0 = 32;
int nth1 = 1;
int nrows = 1;
// use custom matrix x vector kernel
switch (src0t) {
case GGML_TYPE_F16:
{
nth0 = 64;
nth0 = 32;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
if (ne11 * ne12 < 4) {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
nrows = ne11;
} else {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
nrows = 4;
}
} break;
case GGML_TYPE_Q4_0:
{
@ -894,8 +969,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);
nth0 = 2;
nth1 = 32;
nth0 = 4; //1;
nth1 = 8; //32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
} break;
case GGML_TYPE_Q5_K:
@ -943,9 +1018,12 @@ void ggml_metal_graph_compute(
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -959,8 +1037,8 @@ void ggml_metal_graph_compute(
else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
int64_t ny = (ne11 + nrows - 1)/nrows;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
}
} break;
@ -1105,7 +1183,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
} break;
case GGML_OP_DUP:
case GGML_OP_CPY:

View file

@ -25,9 +25,9 @@ typedef struct {
} block_q8_0;
kernel void kernel_add(
device const float * src0,
device const float * src1,
device float * dst,
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig];
}
@ -35,18 +35,18 @@ kernel void kernel_add(
// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_add_row(
device const float * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
constant int64_t & nb,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig % ne00];
dst[tpig] = src0[tpig] + src1[tpig % nb];
}
kernel void kernel_mul(
device const float * src0,
device const float * src1,
device float * dst,
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * src1[tpig];
}
@ -54,27 +54,27 @@ kernel void kernel_mul(
// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_mul_row(
device const float * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
constant int64_t & nb,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * src1[tpig % ne00];
dst[tpig] = src0[tpig] * src1[tpig % nb];
}
kernel void kernel_scale(
device const float * src0,
device float * dst,
device const float4 * src0,
device float4 * dst,
constant float & scale,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * scale;
}
kernel void kernel_silu(
device const float * src0,
device float * dst,
device const float4 * src0,
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
float x = src0[tpig];
device const float4 & x = src0[tpig];
dst[tpig] = x / (1.0f + exp(-x));
}
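The float4 variants above are the same elementwise ops applied to four lanes at a time; kernel_silu, for instance, computes silu(x) = x * sigmoid(x) = x / (1 + e^(-x)) per lane, which is why the dispatch sites divide ggml_nelements(dst) by 4.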
@ -89,10 +89,10 @@ constant float GELU_COEF_A = 0.044715f;
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
kernel void kernel_gelu(
device const float * src0,
device float * dst,
device const float4 * src0,
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
float x = src0[tpig];
device const float4 & x = src0[tpig];
// BEWARE !!!
// Simply using "tanh" instead of "precise::tanh" will sometimes result in NaNs!
@ -107,7 +107,6 @@ kernel void kernel_soft_max(
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
threadgroup float * buf [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
@ -119,55 +118,67 @@ kernel void kernel_soft_max(
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
// parallel max
buf[tpitg[0]] = -INFINITY;
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
buf[tpitg[0]] = MAX(buf[tpitg[0]], psrc0[i00]);
float lmax = psrc0[tpitg[0]];
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
lmax = MAX(lmax, psrc0[i00]);
}
// reduce
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg[0]/2; i > 0; i /= 2) {
if (tpitg[0] < i) {
buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
// broadcast
if (tpitg[0] == 0) {
buf[0] = buf[0];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
const float max = buf[0];
const float max = simd_max(lmax);
// parallel sum
buf[tpitg[0]] = 0.0f;
float lsum = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
buf[tpitg[0]] += exp(psrc0[i00] - max);
const float exp_psrc0 = exp(psrc0[i00] - max);
lsum += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not
// wish to compute it twice.
pdst[i00] = exp_psrc0;
}
// reduce
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg[0]/2; i > 0; i /= 2) {
if (tpitg[0] < i) {
buf[tpitg[0]] += buf[tpitg[0] + i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
// broadcast
if (tpitg[0] == 0) {
buf[0] = buf[0];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
const float sum = buf[0];
const float sum = simd_sum(lsum);
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
pdst[i00] = exp(psrc0[i00] - max) / sum;
pdst[i00] /= sum;
}
}
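
The rewrite replaces the threadgroup-memory tree reductions with simdgroup reductions (simd_max/simd_sum) and caches the exp values in dst so exp runs once per element. A scalar C sketch of the numerically stable softmax being computed (illustrative, not the kernel itself):

#include <math.h>

static void softmax_ref(const float * src, float * dst, int n) {
    float max = src[0];
    for (int i = 1; i < n; ++i) {
        if (src[i] > max) max = src[i];   // subtract the max for numerical stability
    }
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        dst[i] = expf(src[i] - max);      // cache exp: it is the expensive part
        sum += dst[i];
    }
    for (int i = 0; i < n; ++i) {
        dst[i] /= sum;
    }
}
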
kernel void kernel_soft_max_4(
device const float * src0,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
// parallel max
float4 lmax4 = psrc4[tpitg[0]];
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
lmax4 = fmax(lmax4, psrc4[i00]);
}
float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
const float max = simd_max(lmax);
// parallel sum
float4 lsum4 = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
const float4 exp_psrc4 = exp(psrc4[i00] - max);
lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4;
}
float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
const float sum = simd_sum(lsum);
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
pdst4[i00] /= sum;
}
}
@ -186,6 +197,33 @@ kernel void kernel_diag_mask_inf(
dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
} else {
dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
}
}
kernel void kernel_diag_mask_inf_8(
device const float4 * src0,
device float4 * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int & n_past,
uint3 tpig[[thread_position_in_grid]]) {
const int64_t i = 2*tpig[0];
dst[i+0] = src0[i+0];
dst[i+1] = src0[i+1];
int64_t i4 = 4*i;
const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
const int64_t i01 = i4/(ne00); i4 -= i01*ne00;
const int64_t i00 = i4;
for (int k = 3; k >= 0; --k) {
if (i00 + 4 + k <= n_past + i01) {
break;
}
dst[i+1][k] = -INFINITY;
if (i00 + k > n_past + i01) {
dst[i][k] = -INFINITY;
}
}
}
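
The 8-wide variant copies two float4 vectors per thread and then patches -INFINITY into the masked tail positions. The underlying per-element rule, in scalar C for reference (assuming a row-major [ne01][ne00] slice):

#include <math.h>

static void diag_mask_inf_ref(const float * src, float * dst,
                              int ne00, int ne01, int n_past) {
    for (int i01 = 0; i01 < ne01; ++i01) {
        for (int i00 = 0; i00 < ne00; ++i00) {
            // mask out everything past the causal boundary n_past + row
            dst[i01*ne00 + i00] = (i00 > n_past + i01) ? -INFINITY
                                                       : src[i01*ne00 + i00];
        }
    }
}
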
@ -214,25 +252,17 @@ kernel void kernel_norm(
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
// broadcast
if (tpitg == 0) {
sum[0] /= ne00;
}
const float mean = sum[0] / ne00;
// recenter and VARIANCE
threadgroup_barrier(mem_flags::mem_threadgroup);
const float mean = sum[0];
// recenter
device float * y = dst + tgpig*ne00;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
y[i00] = x[i00] - mean;
}
// VARIANCE
// parallel sum
sum[tpitg] = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
y[i00] = x[i00] - mean;
sum[tpitg] += y[i00] * y[i00];
}
// reduce
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg/2; i > 0; i /= 2) {
@ -241,12 +271,7 @@ kernel void kernel_norm(
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
// broadcast
if (tpitg == 0) {
sum[0] /= ne00;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
const float variance = sum[0];
const float variance = sum[0] / ne00;
const float scale = 1.0f/sqrt(variance + eps);
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@ -254,7 +279,6 @@ kernel void kernel_norm(
}
}
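
With the redundant broadcasts removed, each row is still normalized in the usual way: subtract the mean, then scale by 1/sqrt(variance + eps). A plain C reference of that computation (a sketch, not the kernel):

#include <math.h>

static void norm_ref(const float * x, float * y, int n, float eps) {
    float mean = 0.0f;
    for (int i = 0; i < n; ++i) mean += x[i];
    mean /= n;

    float var = 0.0f;
    for (int i = 0; i < n; ++i) {
        y[i]  = x[i] - mean;   // recenter
        var  += y[i] * y[i];
    }
    var /= n;

    const float scale = 1.0f / sqrtf(var + eps);
    for (int i = 0; i < n; ++i) y[i] *= scale;
}
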
kernel void kernel_rms_norm(
device const void * src0,
device float * dst,
@ -435,6 +459,8 @@ kernel void kernel_mul_mat_q4_1_f32(
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
}
#define NB_Q8_0 8
kernel void kernel_mul_mat_q8_0_f32(
device const void * src0,
device const float * src1,
@ -463,30 +489,30 @@ kernel void kernel_mul_mat_q8_0_f32(
device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[16];
float yl[NB_Q8_0];
float sumf[nr]={0.f};
const int ix = tiisg/2;
const int il = tiisg%2;
const int ix = tiisg/4;
const int il = tiisg%4;
device const float * yb = y + ix * QK8_0 + 16*il;
device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
// each thread in a SIMD group deals with half a block.
for (int ib = ix; ib < nb; ib += nw/2) {
for (int i = 0; i < 16; ++i) {
// each thread in a SIMD group deals with NB_Q8_0 quants at a time
for (int ib = ix; ib < nb; ib += nw/4) {
for (int i = 0; i < NB_Q8_0; ++i) {
yl[i] = yb[i];
}
for (int row = 0; row < nr; row++) {
device const int8_t * qs = x[ib+row*nb].qs + 16*il;
device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
float sumq = 0.f;
for (int iq = 0; iq < 16; ++iq) {
for (int iq = 0; iq < NB_Q8_0; ++iq) {
sumq += qs[iq] * yl[iq];
}
sumf[row] += sumq*x[ib+row*nb].d;
}
yb += QK8_0 * 16;
yb += NB_Q8_0 * nw;
}
for (int row = 0; row < nr; ++row) {
@ -497,6 +523,60 @@ kernel void kernel_mul_mat_q8_0_f32(
}
}
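
Each SIMD lane now walks NB_Q8_0 = 8 quants at a time, but the value computed is the same block-scaled dot product. A hedged C sketch with simplified types (the real block stores d as fp16):

typedef struct {
    float       d;      // block scale (fp16 in the real struct; simplified here)
    signed char qs[32]; // QK8_0 = 32 signed 8-bit quants
} block_q8_0_ref;

static float dot_q8_0_ref(const block_q8_0_ref * x, const float * y, int nb) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        float sumq = 0.0f;
        for (int i = 0; i < 32; ++i) {
            sumq += x[ib].qs[i] * y[ib*32 + i];
        }
        sumf += sumq * x[ib].d; // scale once per block
    }
    return sumf;
}
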
kernel void kernel_mul_mat_f16_f32_1row(
device const char * src0,
device const char * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
if (ne00 < 128) {
for (int i = tiisg; i < ne00; i += 32) {
sumf += (float) x[i] * (float) y[i];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
} else {
device const half4 * x4 = (device const half4 *) x;
device const float4 * y4 = (device const float4 *) y;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
#define N_F16_F32 4
kernel void kernel_mul_mat_f16_f32(
device const char * src0,
device const char * src1,
@ -515,36 +595,99 @@ kernel void kernel_mul_mat_f16_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
threadgroup float * sum [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpig[[thread_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 tptg[[threads_per_threadgroup]]) {
uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
const int64_t rb = tgpig.y*N_F16_F32;
const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
sum[tpitg.x] = 0.0f;
if (ne00 < 128) {
for (int row = 0; row < N_F16_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
for (int i = tpitg.x; i < ne00; i += tptg.x) {
sum[tpitg.x] += (float) x[i] * (float) y[i];
}
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
// accumulate the sum from all threads in the threadgroup
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = tptg.x/2; i > 0; i /= 2) {
if (tpitg.x < i) {
sum[tpitg.x] += sum[tpitg.x + i];
float sumf = 0;
for (int i = tiisg; i < ne00; i += 32) {
sumf += (float) x[i] * (float) y[i];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
} else {
device const half4 * x4 = (device const half4 *)x;
for (int row = 0; row < N_F16_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
if (tpitg.x == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
device const float4 * y4 = (device const float4 *) y;
float sumf = 0;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
}
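
Both f16 x f32 paths share the same shape: a 4-wide inner loop over ne00/4, then a scalar tail for the remainder, reduced across the simdgroup with simd_sum. The sequential equivalent in C, for reference:

static float dot_tail_ref(const float * x, const float * y, int n) {
    float sum = 0.0f;
    int i = 0;
    for (; i + 4 <= n; i += 4) {                       // vectorizable body
        for (int k = 0; k < 4; ++k) sum += x[i+k] * y[i+k];
    }
    for (; i < n; ++i) sum += x[i] * y[i];             // tail when n % 4 != 0
    return sum;
}
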
// Assumes row size (ne00) is a multiple of 4
kernel void kernel_mul_mat_f16_f32_l4(
device const char * src0,
device const char * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int nrows = ne11;
const int64_t r0 = tgpig.x;
const int64_t im = tgpig.z;
device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
for (int r1 = 0; r1 < nrows; ++r1) {
device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
@ -614,25 +757,27 @@ kernel void kernel_rope(
constant int & mode,
constant float & freq_base,
constant float & freq_scale,
uint3 tpig[[thread_position_in_grid]]) {
const int64_t i3 = tpig[2];
const int64_t i2 = tpig[1];
const int64_t i1 = tpig[0];
uint tiitg[[thread_index_in_threadgroup]],
uint3 tptg[[threads_per_threadgroup]],
uint3 tgpig[[threadgroup_position_in_grid]]) {
const int64_t i3 = tgpig[2];
const int64_t i2 = tgpig[1];
const int64_t i1 = tgpig[0];
const bool is_neox = mode & 2;
const float theta_scale = pow(freq_base, -2.0f/n_dims);
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
float theta = freq_scale * (float)p;
const float theta_0 = freq_scale * (float)p;
const float inv_ndims = -1.f/n_dims;
if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
const float cos_theta = cos(theta);
const float sin_theta = sin(theta);
theta *= theta_scale;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -644,12 +789,12 @@ kernel void kernel_rope(
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) {
for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
const float cos_theta = cos(theta);
const float sin_theta = sin(theta);
theta *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
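
The loop can now be split across threads only because theta is computed in closed form instead of by the old running product theta *= theta_scale. A small standalone C check of the equivalence (the constants here are arbitrary assumptions for illustration):

#include <math.h>
#include <stdio.h>

int main(void) {
    const float freq_base = 10000.0f, n_dims = 128.0f, theta_0 = 3.0f;
    const float theta_scale = powf(freq_base, -2.0f/n_dims); // old per-step factor
    const float inv_ndims   = -1.0f/n_dims;

    float theta_iter = theta_0;
    for (int i0 = 0; i0 < 16; i0 += 2) {
        // closed form used by the new kernel: theta_0 * base^(-i0/n_dims)
        const float theta_closed = theta_0 * powf(freq_base, inv_ndims*i0);
        printf("i0=%2d  iter=%.6f  closed=%.6f\n", i0, theta_iter, theta_closed);
        theta_iter *= theta_scale; // old sequential update
    }
    return 0;
}
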
@ -1053,31 +1198,40 @@ kernel void kernel_mul_mat_q3_K_f32(
device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0;
device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1;
float yl[16];
float yl[32];
const uint16_t kmask1 = 0x0303;
const uint16_t kmask1 = 0x3030;
const uint16_t kmask2 = 0x0f0f;
const int tid = tiisg/2;
const int ix = tiisg%2;
const int ip = tid/8; // 0 or 1
const int il = tid/2 - 4*ip; // 0...3
const int tid = tiisg/4;
const int ix = tiisg%4;
const int ip = tid/4; // 0 or 1
const int il = 2*((tid%4)/2); // 0 or 2
const int ir = tid%2;
const int n = 8;
const int l0 = n*ir;
const uint16_t m1 = 1 << (4*ip + il);
const uint16_t m2 = m1 << 8;
// One would think that the Metal compiler would figure out that ip and il can only have
// 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it
// with these two tables.
//
// Possible masks for the high bit
const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200}, // ip = 0, il = 0
{0x0004, 0x0400, 0x0008, 0x0800}, // ip = 0, il = 2
{0x0010, 0x1000, 0x0020, 0x2000}, // ip = 1, il = 0
{0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2
// Possible masks for the low 2 bits
const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}};
const ushort4 hm = mm[2*ip + il/2];
const int shift = 2*il;
const uint16_t qm1 = 0x0003 << shift;
const uint16_t qm2 = 0x0300 << shift;
const int32_t v1 = 4 << shift;
const int32_t v2 = 1024 << shift;
const float v1 = il == 0 ? 4.f : 64.f;
const float v2 = 4.f * v1;
const uint16_t s_shift1 = 4*ip;
const uint16_t s_shift2 = s_shift1 + 2*(il/2);
const int ik = 4 + (il%2);
const uint16_t s_shift2 = s_shift1 + il;
const int q_offset = 32*ip + l0;
const int y_offset = 128*ip + 32*il + l0;
@ -1086,12 +1240,19 @@ kernel void kernel_mul_mat_q3_K_f32(
device const float * y1 = yy + ix*QK_K + y_offset;
float sumf1[2] = {0.f}, sumf2[2] = {0.f};
for (int i = ix; i < nb; i += 2) {
uint32_t scales32, aux32;
thread uint16_t * scales16 = (thread uint16_t *)&scales32;
thread const int8_t * scales = (thread const int8_t *)&scales32;
float sumf1[2] = {0.f};
float sumf2[2] = {0.f};
for (int i = ix; i < nb; i += 4) {
for (int l = 0; l < 8; ++l) {
yl[l+0] = y1[l+ 0];
yl[l+8] = y1[l+16];
yl[l+ 0] = y1[l+ 0];
yl[l+ 8] = y1[l+16];
yl[l+16] = y1[l+32];
yl[l+24] = y1[l+48];
}
device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset);
@ -1102,27 +1263,43 @@ kernel void kernel_mul_mat_q3_K_f32(
for (int row = 0; row < 2; ++row) {
const float d_all = (float)dh[0];
const char2 scales = as_type<char2>((uint16_t)(((a[il] >> s_shift1) & kmask2) | (((a[ik] >> s_shift2) & kmask1) << 4)));
float s1 = 0, s2 = 0;
for (int l = 0; l < n; l += 2) {
const uint16_t qs = q[l/2];
s1 += yl[l+0] * ((int32_t)(qs & qm1) - ((h[l/2] & m1) ? 0 : v1));
s2 += yl[l+1] * ((int32_t)(qs & qm2) - ((h[l/2] & m2) ? 0 : v2));
}
float d = d_all * (s1 + 1.f/256.f * s2);
sumf1[row] += d * scales[0];
sumf2[row] += d;
scales16[0] = a[4];
scales16[1] = a[5];
aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030;
scales16[0] = a[il+0];
scales16[1] = a[il+1];
scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32;
s1 = s2 = 0;
float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0;
for (int l = 0; l < n; l += 2) {
const uint16_t qs = q[l/2+8];
s1 += yl[l+8] * ((int32_t)(qs & qm1) - ((h[l/2+8] & m1) ? 0 : v1));
s2 += yl[l+9] * ((int32_t)(qs & qm2) - ((h[l/2+8] & m2) ? 0 : v2));
const int32_t qs = q[l/2];
s1 += yl[l+0] * (qs & qm[il/2][0]);
s2 += yl[l+1] * (qs & qm[il/2][1]);
s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]);
s4 += yl[l+16] * (qs & qm[il/2][2]);
s5 += yl[l+17] * (qs & qm[il/2][3]);
s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]);
}
d = d_all * (s1 + 1.f/256.f * s2);
sumf1[row] += d * scales[1];
sumf2[row] += d;
float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
sumf1[row] += d1 * (scales[0] - 32);
sumf2[row] += d2 * (scales[2] - 32);
s1 = s2 = s3 = s4 = s5 = s6 = 0;
for (int l = 0; l < n; l += 2) {
const int32_t qs = q[l/2+8];
s1 += yl[l+8] * (qs & qm[il/2][0]);
s2 += yl[l+9] * (qs & qm[il/2][1]);
s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]);
s4 += yl[l+24] * (qs & qm[il/2][2]);
s5 += yl[l+25] * (qs & qm[il/2][3]);
s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 0.f : yl[l+25]);
}
d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
sumf1[row] += d1 * (scales[1] - 32);
sumf2[row] += d2 * (scales[3] - 32);
q += step;
h += step;
@ -1131,17 +1308,20 @@ kernel void kernel_mul_mat_q3_K_f32(
}
y1 += 2 * QK_K;
y1 += 4 * QK_K;
}
for (int row = 0; row < 2; ++row) {
const float sumf = (sumf1[row] - 32.f*sumf2[row]) / (1 << shift);
const float tot = simd_sum(sumf);
if (tiisg == 0) {
dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot;
const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift);
sumf1[row] = simd_sum(sumf);
}
if (tiisg == 0) {
for (int row = 0; row < 2; ++row) {
dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
}
}
}
#else
kernel void kernel_mul_mat_q3_K_f32(
@ -1244,7 +1424,8 @@ kernel void kernel_mul_mat_q4_K_f32(
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int r2 = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
//const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int first_row = r0 * N_DST;
const int ib_row = first_row * nb;
const uint offset0 = r2/gqa*(nb*ne0);
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
@ -1493,17 +1674,25 @@ kernel void kernel_mul_mat_q5_K_f32(
sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
float4 acc = {0.f, 0.f, 0.f, 0.f};
float4 acc1 = {0.f};
float4 acc2 = {0.f};
for (int l = 0; l < n; ++l) {
uint8_t h = qh[l];
acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0));
acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0));
acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0));
acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0));
acc1[0] += yl[l+0] * (q1[l] & 0x0F);
acc1[1] += yl[l+8] * (q1[l] & 0xF0);
acc1[2] += yh[l+0] * (q2[l] & 0x0F);
acc1[3] += yh[l+8] * (q2[l] & 0xF0);
acc2[0] += h & hm1 ? yl[l+0] : 0.f;
acc2[1] += h & hm2 ? yl[l+8] : 0.f;
acc2[2] += h & hm3 ? yh[l+0] : 0.f;
acc2[3] += h & hm4 ? yh[l+8] : 0.f;
}
const float dall = dh[0];
const float dmin = dh[1];
sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) -
sumf[row] += dall * (sc8[0] * (acc1[0] + 16.f*acc2[0]) +
sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) +
sc8[4] * (acc1[2] + 16.f*acc2[2]) +
sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) -
dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
q1 += step;
@ -1686,29 +1875,34 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
template <typename type4x4>
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
const half d = il ? (xb->d / 16.h) : xb->d;
const half m = il ? ( -8.h * 16.h) : -8.h;
const float d1 = il ? (xb->d / 16.h) : xb->d;
const float d2 = d1 / 256.f;
const float md = -8.h * xb->d;
const ushort mask0 = il ? 0x00F0 : 0x000F;
const ushort mask1 = il ? 0xF000 : 0x0F00;
const ushort mask1 = mask0 << 8;
for (int i=0;i<8;i++) {
reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) + m) * d;
reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d;
reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
}
}
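
The high-nibble shift has been folded into a second scale d2 = d1/256. This is exact: the masked value is a multiple of 256, and dividing a float by 256 only changes the exponent, so both forms round the same real product. A quick standalone C check (an illustration, not project code):

#include <assert.h>
#include <stdint.h>

int main(void) {
    const float d = 0.123f; // arbitrary block scale for the check
    for (uint32_t q = 0; q <= 0xFFFF; ++q) {
        const uint32_t hi = q & 0x0F00;
        // ((hi >> 8) * d) and (hi * (d/256)) agree bit-for-bit
        assert((float)(hi >> 8) * d == (float)hi * (d/256.0f));
    }
    return 0;
}
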
template <typename type4x4>
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
const half d = il ? (xb->d / 16.h) : xb->d;
const half m = xb->m;
const float d1 = il ? (xb->d / 16.h) : xb->d;
const float d2 = d1 / 256.f;
const float m = xb->m;
const ushort mask0 = il ? 0x00F0 : 0x000F;
const ushort mask1 = il ? 0xF000 : 0x0F00;
const ushort mask1 = mask0 << 8;
for (int i=0;i<8;i++) {
reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) * d) + m;
reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) * d) + m;
reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
}
}
@ -1744,7 +1938,7 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
template <typename type4x4>
void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
const float d_all = (float)(xb->d);
const half d_all = xb->d;
device const uint8_t * q = (device const uint8_t *)xb->qs;
device const uint8_t * h = (device const uint8_t *)xb->hmask;
device const int8_t * scales = (device const int8_t *)xb->scales;
@ -1757,17 +1951,20 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
((il/4)>0 ? 12 : 3);
uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) : \
(scale_2&kmask2) | ((scale_1&kmask1) << 4);
float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
: (scale_2&kmask2) | ((scale_1&kmask1) << 4);
half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h);
const half ml = 4.h * dl;
il = (il/2)%4;
float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
il = (il/2) & 3;
const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
dl *= coef;
for (int i = 0; i < 16; ++i) {
reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i] & m) ? 0 : 4.f/coef));
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
}
#else
float kcoef = il&1 ? 1.f/16.f : 1.f;
uint16_t kmask = il&1 ? 0xF0 : 0x0F;
@ -1781,31 +1978,37 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
#endif
}
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
: uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
}
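
get_scale_min_k4_just2 pulls a single (scale, min) pair out of the packed 12-byte K-quant scales instead of materializing all four values. A scalar C sketch of the underlying 6-bit packing, written by reading the expression above (the k offset is dropped here for simplicity, so treat the layout description as an assumption):

#include <stdint.h>

// For j < 4 the 6 bits sit directly in q[j] (scale) and q[j+4] (min);
// for j >= 4 the low 4 bits come from q[j+4] and the high 2 bits are
// borrowed from the top bits of q[j-4] and q[j].
static void get_scale_min_ref(int j, const uint8_t * q,
                              uint8_t * sc, uint8_t * m) {
    if (j < 4) {
        *sc = q[j]     & 63;
        *m  = q[j + 4] & 63;
    } else {
        *sc = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4);
        *m  = (q[j + 4] >>   4) | ((q[j]     >> 6) << 4);
    }
}
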
template <typename type4x4>
void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
device const uint8_t * q = xb->qs;
device const uchar * q = xb->qs;
#if QK_K == 256
const float d = (float)(xb->d);
const float min = (float)(xb->dmin);
short is = (il/4) * 2;
q = q + (il/4) * 32 + 16 * (il&1);
il = il%4;
const uchar4 sc = get_scale_min_k4(is, xb->scales);
const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h;
const float ml = il<2 ? min * sc[1] : min * sc[3];
il = il & 3;
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
const half d = il < 2 ? xb->d : xb->d / 16.h;
const half min = xb->dmin;
const half dl = d * sc[0];
const half ml = min * sc[1];
#else
q = q + 16 * (il&1);
device const uint8_t * s = xb->scales;
device const half2 * dh = (device const half2 *)xb->d;
const float2 d = (float2)dh[0];
const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
const float ml = il<2 ? d[1] * (s[0]>>4) : d[1 ]* (s[1]>>4);
const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4);
#endif
const ushort mask = il<2 ? 0x0F : 0xF0;
for (int i = 0; i < 16; ++i) {
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
}
}
template <typename type4x4>
@ -1814,19 +2017,19 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
device const uint8_t * qh = xb->qh;
#if QK_K == 256
const float d = (float)(xb->d);
const float min = (float)(xb->dmin);
short is = (il/4) * 2;
q = q + 32 * (il/4) + 16 * (il&1);
qh = qh + 16 * (il&1);
uint8_t ul = 1 << (il/2);
il = il%4;
const uchar4 sc = get_scale_min_k4(is, xb->scales);
const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h;
const float ml = il<2 ? min * sc[1] : min * sc[3];
il = il & 3;
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
const half d = il < 2 ? xb->d : xb->d / 16.h;
const half min = xb->dmin;
const half dl = d * sc[0];
const half ml = min * sc[1];
const ushort mask = il<2 ? 0x0F : 0xF0;
const float qh_val = il<2 ? 16.f : 256.f;
const ushort mask = il<2 ? 0x0F : 0xF0;
const half qh_val = il<2 ? 16.h : 256.h;
for (int i = 0; i < 16; ++i) {
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
}
@ -1845,7 +2048,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
template <typename type4x4>
void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
const float d_all = (float)(xb->d);
const half d_all = xb->d;
device const uint8_t * ql = (device const uint8_t *)xb->ql;
device const uint8_t * qh = (device const uint8_t *)xb->qh;
device const int8_t * scales = (device const int8_t *)xb->scales;
@ -1853,19 +2056,21 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
#if QK_K == 256
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
qh = qh + 32*(il/8) + 16*(il&1);
float sc = scales[(il%2) + 2 * ((il/2))];
il = (il/2)%4;
half sc = scales[(il%2) + 2 * ((il/2))];
il = (il/2) & 3;
#else
ql = ql + 16 * (il&1);
float sc = scales[il];
half sc = scales[il];
#endif
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
const half coef = il>1 ? 1.f/16.h : 1.h;
const half ml = d_all * sc * 32.h;
const half dl = d_all * sc * coef;
for (int i = 0; i < 16; ++i) {
uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
const float coef = il>1 ? 1.f/16.f : 1.f;
float q = il&1 ? ((ql[i]&kmask2)|((qh[i]&kmask1)<<2)) - 32.f/coef : \
((ql[i]&kmask2)|((qh[i]&kmask1)<<4)) - 32.f/coef;
reg[i/4][i%4] = d_all * sc * q * coef;
const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
: ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
reg[i/4][i%4] = dl * q - ml;
}
}
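
The q6_K rewrite hoists the constant offset out of the loop: d_all*sc*coef*(q - 32/coef) becomes dl*q - ml with dl = d_all*sc*coef and ml = d_all*sc*32. A small C check of that algebra (arbitrary example values, tolerance only for float rounding):

#include <assert.h>
#include <math.h>

int main(void) {
    const float d_all = 0.5f, sc = 3.0f, coef = 1.0f/16.0f;
    const float dl = d_all * sc * coef;
    const float ml = d_all * sc * 32.0f;
    for (int q = 0; q < 64; ++q) {
        const float old_v = d_all * sc * (q - 32.0f/coef) * coef;
        const float new_v = dl * q - ml;
        assert(fabsf(old_v - new_v) < 1e-3f);
    }
    return 0;
}
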

View file

@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
return;
}
cl_mem mem = (cl_mem)tensor->data;
cl_mem mem = (cl_mem)tensor->extra;
clReleaseMemObject(mem);
}
@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
size_t d_size;
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data;
d_X = (cl_mem) src0->extra;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data;
d_X = (cl_mem) src0->extra;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
}
@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
} else if (src0->backend == GGML_BACKEND_GPU) {
d_Q = (cl_mem) src0->data;
d_Q = (cl_mem) src0->extra;
} else {
GGML_ASSERT(false);
}
@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
CL_CHECK(clFinish(queue));
tensor->data = dst;
tensor->extra = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}

313
ggml.c
View file

@ -1,4 +1,3 @@
#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#include "ggml.h"
@ -47,6 +46,10 @@
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
// disable POSIX deprecation warnings
// these functions are never going away, anyway
#pragma warning(disable: 4996)
#endif
#if defined(_WIN32)
@ -103,6 +106,9 @@ typedef void * thread_ret_t;
#include <sys/stat.h>
#include <unistd.h>
#endif
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@ -192,9 +198,15 @@ typedef void * thread_ret_t;
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
inline static void * ggml_aligned_malloc(size_t size) {
if (size == 0) {
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
return NULL;
}
void * aligned_memory = NULL;
#ifdef GGML_USE_METAL
int result = posix_memalign(&aligned_memory, getpagesize(), size);
#ifdef GGML_USE_CPU_HBM
int result = hbw_posix_memalign(&aligned_memory, 16, size);
#elif GGML_USE_METAL
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
#else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
#endif
@ -215,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
return aligned_memory;
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
#ifdef GGML_USE_CPU_HBM
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
#else
#define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif
#endif
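
For context, the non-Windows allocation path now has three variants (HBM, Metal page-aligned, default). A minimal POSIX C sketch of the pattern, using a hypothetical alignment rather than the project's macro values:

#include <stdio.h>
#include <stdlib.h>

#define ALIGN 32 // hypothetical; ggml picks GGML_MEM_ALIGN, the page size, or the HBM variant

static void * aligned_malloc_ref(size_t size) {
    if (size == 0) {
        fprintf(stderr, "warning: allocating 0 bytes\n"); // mirrors the new guard
        return NULL;
    }
    void * mem = NULL;
    // posix_memalign returns 0 on success, an error number otherwise
    if (posix_memalign(&mem, ALIGN, size) != 0) {
        return NULL;
    }
    return mem;
}
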
#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@ -294,12 +310,18 @@ typedef double ggml_float;
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#ifdef __F16C__
@ -813,46 +835,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
#if !defined(__aarch64__)
inline static uint16_t vaddvq_u8(uint8x16_t v) {
return
(uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
(uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
(uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
(uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
(uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
(uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
(uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
(uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
}
inline static int16_t vaddvq_s8(int8x16_t v) {
return
(int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
(int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
(int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
(int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
(int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
(int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
(int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
(int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
}
inline static int32_t vaddvq_s16(int16x8_t v) {
return
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
}
inline static uint32_t vaddvq_u16(uint16x8_t v) {
return
(uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
(uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
(uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
(uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
}
inline static int32_t vaddvq_s32(int32x4_t v) {
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}
@ -861,12 +843,6 @@ inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
inline static float vminvq_f32(float32x4_t v) {
return
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
inline static float vmaxvq_f32(float32x4_t v) {
return
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@ -2677,6 +2653,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
}
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
}
*s = sumf;
#else
// scalar
float sumf = 0.0;
@ -2803,6 +2814,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
}
*s = hsum_float_8(acc) + summs;
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
*s = sumf;
#else
// scalar
float sumf = 0.0;
@ -3037,6 +3080,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
}
*s = hsum_float_8(acc);
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
uint32_t qh;
// These temp values are for masking and shift operations
uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));
// temporary registers
vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
// ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
// ((qh & (1u << (j + 16))) >> (j + 12));
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
// narrowing
vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
// load
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
}
*s = sumf;
#else
// scalar
float sumf = 0.0;
@ -3293,6 +3406,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
}
*s = hsum_float_8(acc) + summs;
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
uint32_t qh;
// These temp values are for shift operations
uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));
// temporary registers
vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
// load qh
vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
// ((qh >> (j + 0)) << 4) & 0x10;
vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
// ((qh >> (j + 12)) ) & 0x10;
vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
// narrowing
vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
// load
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
*s = sumf;
#else
// scalar
float sumf = 0.0;
@ -3404,6 +3583,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
}
*s = hsum_float_8(acc);
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
size_t vl = __riscv_vsetvl_e8m1(qk);
for (int i = 0; i < nb; i++) {
// load elements
vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
}
*s = sumf;
#else
// scalar
float sumf = 0.0;
@ -4385,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
return NULL;
}
// allow to call ggml_init with 0 size
if (params.mem_size == 0) {
params.mem_size = GGML_MEM_ALIGN;
}
const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
*ctx = (struct ggml_context) {
@ -4587,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
size_t obj_alloc_size = 0;
if (view_src == NULL && ctx->no_alloc == false) {
if (view_src == NULL && !ctx->no_alloc) {
if (ctx->scratch.data != NULL) {
// allocate tensor data in the scratch buffer
if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@ -5288,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
}
if (inplace) {
GGML_ASSERT(is_node == false);
GGML_ASSERT(!is_node);
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -5331,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
}
if (inplace) {
GGML_ASSERT(is_node == false);
GGML_ASSERT(!is_node);
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -18673,7 +18877,6 @@ static enum ggml_opt_result linesearch_backtracking(
// strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
return count;
}
return count;
}
}
@ -19776,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
struct ggml_tensor * data = NULL;
if (params.no_alloc == false) {
if (!params.no_alloc) {
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
ok = ok && data != NULL;
@ -19817,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
}
// point the data member to the appropriate location in the binary blob using the tensor infos
if (params.no_alloc == false) {
if (!params.no_alloc) {
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
}

View file

@ -1,16 +1,18 @@
#!/usr/bin/env python3
import shutil
import sys
import struct
import tempfile
import numpy as np
from __future__ import annotations
import json
import os
from pathlib import Path
import shutil
import struct
import sys
import tempfile
from enum import IntEnum, auto
from io import BufferedWriter
from typing import Any, BinaryIO, Callable, IO, Dict, List, Optional, Sequence, Tuple, Union
from pathlib import Path
from typing import IO, Any, BinaryIO, Callable, Sequence
import numpy as np
#
# constants
@ -103,7 +105,7 @@ class MODEL_TENSOR(IntEnum):
FFN_NORM : int = auto()
MODEL_ARCH_NAMES: Dict[MODEL_ARCH, str] = {
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.GPT2: "gpt2",
@ -112,7 +114,7 @@ MODEL_ARCH_NAMES: Dict[MODEL_ARCH, str] = {
MODEL_ARCH.MPT: "mpt",
}
MODEL_TENSOR_NAMES: Dict[MODEL_ARCH, Dict[MODEL_TENSOR, str]] = {
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
MODEL_ARCH.LLAMA: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
@ -158,7 +160,7 @@ MODEL_TENSOR_NAMES: Dict[MODEL_ARCH, Dict[MODEL_TENSOR, str]] = {
}
# tensors that will not be serialized
MODEL_TENSOR_SKIP: Dict[MODEL_ARCH, List[MODEL_TENSOR]] = {
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
@ -167,7 +169,7 @@ MODEL_TENSOR_SKIP: Dict[MODEL_ARCH, List[MODEL_TENSOR]] = {
class TensorNameMap:
mappings_cfg: Dict[MODEL_TENSOR, Tuple[str, ...]] = {
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Token embeddings
MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox
@ -203,7 +205,7 @@ class TensorNameMap:
),
}
block_mappings_cfg: Dict[MODEL_TENSOR, Tuple[str, ...]] = {
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Attention norm
MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
@ -298,9 +300,9 @@ class TensorNameMap:
),
}
mapping: Dict[str, Tuple[MODEL_TENSOR, str]]
mapping: dict[str, tuple[MODEL_TENSOR, str]]
tensor_names: Dict[MODEL_TENSOR, str]
tensor_names: dict[MODEL_TENSOR, str]
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
mapping = self.mapping = {}
@ -321,7 +323,7 @@ class TensorNameMap:
key = key.format(bid = bid)
mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> Optional[Tuple[MODEL_TENSOR, str]]:
def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key)
if result is not None:
return result
@ -332,13 +334,13 @@ class TensorNameMap:
return (result[0], result[1] + suffix)
return None
def get_name(self, key: str, try_suffixes: Sequence[str]) -> Optional[str]:
def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None
return result[1]
def get_type(self, key: str, try_suffixes: Sequence[str]) -> Optional[MODEL_TENSOR]:
def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None
@ -432,10 +434,10 @@ class GGUFWriter:
ti_data = b""
ti_data_count = 0
use_temp_file: bool
temp_file: Optional[tempfile.SpooledTemporaryFile[bytes]] = None
tensors: List[Tuple[np.ndarray[Any, Any], int]]
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
tensors: list[tuple[np.ndarray[Any, Any], int]]
def __init__(self, path: Union[os.PathLike[str], str], arch: str, use_temp_file = True):
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
self.fout = open(path, "wb")
self.arch = arch
self.add_architecture()
@ -531,7 +533,7 @@ class GGUFWriter:
GGUFValueType.FLOAT64: "<d",
GGUFValueType.BOOL: "?" ,
}
def add_val(self, val: Any, vtype: Optional[GGUFValueType] = None, add_vtype: bool = True):
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
if vtype is None:
vtype = GGUFValueType.get_type(val)
@ -561,7 +563,7 @@ class GGUFWriter:
def ggml_pad(x: int, n: int) -> int:
return ((x + n - 1) // n) * n
def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: Union[np.dtype[np.float16], np.dtype[np.float32]], tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None):
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
encoded_name = name.encode("utf8")
@ -580,7 +582,7 @@ class GGUFWriter:
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Optional[Sequence[int]] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
if self.use_temp_file and self.temp_file is None:
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
fp.seek(0)
@ -600,7 +602,7 @@ class GGUFWriter:
if pad != 0:
self.temp_file.write(bytes([0] * pad))
def write_padding(self, fp: BinaryIO, n: int, align: Optional[int] = None):
def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
if pad != 0:
fp.write(bytes([0] * pad))
@ -726,13 +728,13 @@ class GGUFWriter:
def add_tokenizer_model(self, model: str):
self.add_string(KEY_TOKENIZER_MODEL, model)
def add_token_list(self, tokens: Union[Sequence[str], Sequence[bytes], Sequence[bytearray]]):
def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
self.add_array(KEY_TOKENIZER_LIST, tokens)
def add_token_merges(self, merges: Union[Sequence[str], Sequence[bytes], Sequence[bytearray]]):
def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
self.add_array(KEY_TOKENIZER_MERGES, merges)
def add_token_types(self, types: Union[Sequence[TokenType], Sequence[int]]):
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]):
self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
def add_token_scores(self, scores: Sequence[float]):
@ -756,11 +758,11 @@ class GGUFWriter:
class SpecialVocab:
load_merges: bool = False
merges: List[str] = []
special_token_types: Tuple[str, ...] = tuple(('bos', 'eos', 'unk', 'sep', 'pad'))
special_token_ids: Dict[str, int] = {}
merges: list[str] = []
special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
special_token_ids: dict[str, int] = {}
def __init__(self, path: Path, load_merges: bool = False, special_token_types: Optional[Tuple[str, ...]] = None):
def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None):
self.special_token_ids = {}
self.load_merges = load_merges
if special_token_types is not None:
@ -799,7 +801,7 @@ class SpecialVocab:
else:
continue
for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
if isinstance(maybe_token_id, int):
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
self.special_token_ids[typ] = maybe_token_id
break
return True
@ -812,7 +814,7 @@ class SpecialVocab:
config = json.load(f)
for typ in self.special_token_types:
maybe_token_id = config.get(f'{typ}_token_id')
if isinstance(maybe_token_id, int):
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
self.special_token_ids[typ] = maybe_token_id
return True
@ -821,7 +823,7 @@ class SpecialVocab:
print(f'gguf: Adding {len(self.merges)} merge(s).')
gw.add_token_merges(self.merges)
for typ, tokid in self.special_token_ids.items():
handler: Optional[Callable[[int], None]] = getattr(gw, f'add_{typ}_token_id', None)
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
if handler is None:
print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
continue

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.2.1"
version = "0.3.2"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [

42
grammars/c.gbnf Normal file
View file

@ -0,0 +1,42 @@
root ::= (declaration)*
declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}"
dataType ::= "int" ws | "float" ws | "char" ws
identifier ::= [a-zA-Z_] [a-zA-Z_0-9]*
parameter ::= dataType identifier
statement ::=
( dataType identifier ws "=" ws expression ";" ) |
( identifier ws "=" ws expression ";" ) |
( identifier ws "(" argList? ")" ";" ) |
( "return" ws expression ";" ) |
( "while" "(" condition ")" "{" statement* "}" ) |
( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) |
( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) |
( singleLineComment ) |
( multiLineComment )
forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression
forUpdate ::= identifier ws "=" ws expression
condition ::= expression relationOperator expression
relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">")
expression ::= term (("+" | "-") term)*
term ::= factor(("*" | "/") factor)*
factor ::= identifier | number | unaryTerm | funcCall | parenExpression
unaryTerm ::= "-" factor
funcCall ::= identifier "(" argList? ")"
parenExpression ::= "(" ws expression ws ")"
argList ::= expression ("," ws expression)*
number ::= [0-9]+
singleLineComment ::= "//" [^\n]* "\n"
multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/"
ws ::= ([ \t\n]+)
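
For illustration, one string this grammar accepts; ws appears in only a few rules, so whitespace is allowed only where shown (this example is derived by hand from the rules above, not taken from the source):

int square(int x){return x*x;}
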

34
grammars/json_arr.gbnf Normal file
View file

@ -0,0 +1,34 @@
# This is the same as json.gbnf but we restrict whitespace at the end of the root array
# Useful for generating JSON arrays
root ::= arr
value ::= object | array | string | number | ("true" | "false" | "null") ws
arr ::=
"[\n" ws (
value
(",\n" ws value)*
)? "]"
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
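
As an illustration, a minimal array this grammar accepts: the root rule opens with "[\n", separates values with ",\n", and permits no trailing whitespace after the closing bracket (hand-derived example):

[
1,
2,
3]
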

View file

@ -13,6 +13,26 @@
//
#include <arm_neon.h>
#if !defined(__aarch64__)
inline static int32_t vaddvq_s16(int16x8_t v) {
return
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
}
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
return vcombine_s16(a0, b0);
}
inline static int32_t vaddvq_s32(int32x4_t v) {
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}
#endif
#else
#ifdef __wasm_simd128__
@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
float ax = fabsf(x[i]);
if (ax > amax) { amax = ax; max = x[i]; }
}
if (!amax) { // all zero
if (amax < 1e-30f) { // all zero
for (int i = 0; i < n; ++i) {
L[i] = 0;
}
@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
int ntry, float alpha) {
float min = x[0];
float max = x[0];
float sum_x = 0;
float sum_x2 = 0;
for (int i = 1; i < n; ++i) {
if (x[i] < min) min = x[i];
if (x[i] > max) max = x[i];
sum_x += x[i];
sum_x2 += x[i]*x[i];
}
if (max == min) {
for (int i = 0; i < n; ++i) L[i] = 0;
@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
}
if (!max_abs_scale) {
memset(&y[i], 0, sizeof(block_q6_K));
y[i].d = ggml_fp32_to_fp16(0.f);
x += QK_K;
continue;
}
float iscale = -128.f/max_scale;
y[i].d = ggml_fp32_to_fp16(1/iscale);
for (int ib = 0; ib < QK_K/16; ++ib) {
@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
const uint8x16_t m3 = vdupq_n_u8(0x3);
const uint8x16_t m4 = vdupq_n_u8(0xF);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0);
#endif
int8x16x2_t q2bytes;
uint8_t aux[16];
@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
#ifdef __ARM_NEON
const uint8x16_t m3 = vdupq_n_u8(0x3);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0);
#endif
int8x16x4_t q2bytes;
@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
__m256 acc = _mm256_setzero_ps();
uint32_t *aux;
const uint32_t *aux;
for (int i = 0; i < nb; ++i) {
@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
const int8_t * restrict q8 = y[i].qs;
// Set up scales
aux = (uint32_t *)x[i].scales;
aux = (const uint32_t *)x[i].scales;
__m128i scales128 = _mm_set_epi32(
((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@ -2596,8 +2623,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
//int32x4_t isum = mzero;
int32_t sumi1 = 0;
int32_t sumi2 = 0;
@ -3096,9 +3121,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
#ifdef __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf);
const int32x4_t mzero = vdupq_n_s32(0);
const uint8x16_t mone = vdupq_n_u8(1);
const uint8x16_t mtwo = vdupq_n_u8(2);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t mzero = vdupq_n_s32(0);
#endif
int8x16x4_t q5bytes;
@ -3441,8 +3468,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
#ifdef __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf);
const int32x4_t mzero = vdupq_n_s32(0);
const uint8x16_t mh = vdupq_n_u8(16);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t mzero = vdupq_n_s32(0);
#endif
int8x16x4_t q5bytes;
uint8x16x4_t q5h;
@ -3660,7 +3689,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
float sum = 0;
const uint8x16_t m4b = vdupq_n_u8(0xF);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0);
#endif
//const int8x16_t m32s = vdupq_n_s8(32);
const uint8x16_t mone = vdupq_n_u8(3);
@ -4049,8 +4080,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
float sum = 0;
const uint8x16_t m4b = vdupq_n_u8(0xF);
const int32x4_t vzero = vdupq_n_s32(0);
const int8x16_t m32s = vdupq_n_s8(32);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0);
#endif
const uint8x16_t mone = vdupq_n_u8(3);
262
llama.cpp
View file
@ -1,8 +1,3 @@
// Defines fileno on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "llama.h"
#include "ggml.h"
@ -126,6 +121,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
}
s = std::move(result);
}
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
@ -325,6 +323,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_GPT2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
{
LLM_ARCH_GPTJ,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
{
LLM_ARCH_GPTNEOX,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_MPT,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
{
LLM_ARCH_UNKNOWN,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
};
static llm_arch llm_arch_from_string(const std::string & name) {
@ -412,6 +448,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
#elif GGML_USE_METAL
# define llama_host_malloc(n) ggml_metal_host_malloc(n)
# define llama_host_free(data) ggml_metal_host_free(data)
#elif GGML_USE_CPU_HBM
# define llama_host_malloc(n) hbw_malloc(n)
# define llama_host_free(data) if (data != NULL) hbw_free(data)
#else
# define llama_host_malloc(n) malloc(n)
# define llama_host_free(data) free(data)
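The new GGML_USE_CPU_HBM branch routes host allocations through memkind's hbwmalloc API. A minimal availability sketch (assumes libmemkind and its <hbwmalloc.h> header are installed):

#include <hbwmalloc.h>
#include <cstdio>

int main(void) {
    if (hbw_check_available() != 0) {   // returns 0 when HBM is usable
        printf("no high-bandwidth memory on this system\n");
        return 0;
    }
    void * buf = hbw_malloc(1 << 20);   // 1 MiB from HBM
    if (buf != NULL) hbw_free(buf);
    return 0;
}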
@ -568,16 +607,16 @@ struct llama_mmap {
if (prefetch > 0) {
// Advise the kernel to preload the mapped memory
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
}
if (numa) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
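posix_madvise is the portable spelling of the madvise hint used here; one behavioral difference worth knowing is that it returns the error number directly instead of setting errno. A self-contained POSIX sketch:

#include <sys/mman.h>
#include <cstdio>
#include <cstring>

int main(void) {
    const size_t len = 1 << 20;
    void * p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) return 1;
    const int rc = posix_madvise(p, len, POSIX_MADV_WILLNEED);
    if (rc != 0) {
        fprintf(stderr, "posix_madvise: %s\n", strerror(rc)); // rc is the error code
    }
    munmap(p, len);
    return 0;
}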
@ -611,20 +650,25 @@ struct llama_mmap {
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
}
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
if (prefetch) {
// Advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
// PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
// may fail on pre-Windows 8 systems
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
if (pPrefetchVirtualMemory) {
// advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
}
#else
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
}
~llama_mmap() {
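The block above replaces a compile-time version check with a run-time lookup, so one binary works both before and after Windows 8. A Windows-only sketch of the resolution step:

#include <windows.h>
#include <cstdio>

int main(void) {
    HMODULE k32 = GetModuleHandleW(L"kernel32.dll");
    FARPROC fn  = GetProcAddress(k32, "PrefetchVirtualMemory");
    printf(fn ? "PrefetchVirtualMemory available\n"
              : "pre-Windows 8: prefetch skipped\n");
    return 0;
}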
@ -1446,7 +1490,11 @@ struct llama_model_loader {
// allocate temp buffer if not using mmap
if (!use_mmap && cur->data == NULL) {
GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
cur->data = malloc(ggml_nbytes(cur));
#ifdef GGML_USE_CPU_HBM
cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
#else
cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
#endif
}
load_data_for(cur);
@ -1600,9 +1648,13 @@ static void llm_load_hparams(
GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
}
}
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
// gpt-j n_rot = rotary_dim
}
// arch-specific KVs
@ -2919,7 +2971,12 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
// TODO: this is mostly important for Apple Silicon, where CBLAS still performs very well;
// we still need some threads to process all the non-mul_mat ops, but not too many, to avoid
// interfering with the BLAS calls. Needs a better solution.
if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
n_threads = std::min(4, n_threads);
}
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
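The replacement heuristic keeps a few worker threads alive for the non-mul_mat ops instead of collapsing to one. A compilable restatement (function name hypothetical):

#include <algorithm>
#include <cstdio>

static int pick_n_threads(int n_tokens, int n_threads, bool has_blas, bool has_gpublas) {
    if (n_tokens >= 32 && has_blas && !has_gpublas) {
        return std::min(4, n_threads);   // leave the heavy mat-muls to BLAS
    }
    return n_threads;
}

int main(void) {
    printf("%d\n", pick_n_threads(64, 8, /*blas=*/true, /*gpublas=*/false)); // prints 4
    return 0;
}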
@ -3031,33 +3088,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
}
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
}
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
}
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
}
static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(llama_is_control_token(vocab, id));
return id == vocab.special_bos_id;
}
static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
GGML_ASSERT(llama_is_control_token(vocab, id));
return id == vocab.special_eos_id;
}
static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
return id == vocab.special_pad_id;
}
static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id);
@ -3350,9 +3384,15 @@ struct llm_tokenizer_bpe {
std::string byte_str(1, *j);
auto token_multibyte = vocab.token_to_id.find(byte_str);
if (token_multibyte == vocab.token_to_id.end()) {
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
try {
llama_token token_byte = llama_byte_to_token(vocab, *j);
output.push_back(token_byte);
} catch (const std::out_of_range & err) {
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
}
} else {
output.push_back((*token_multibyte).second);
}
output.push_back((*token_multibyte).second);
}
} else {
output.push_back((*token).second);
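The tokenizer change above adds a second-chance lookup: when a multibyte string is missing from the vocab, it tries a byte-level token via .at(), which throws on failure, and only then reports an error. A self-contained sketch of the pattern (toy maps, not the real vocab):

#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

int main(void) {
    std::map<std::string, int>   token_to_id = { { "a", 1 } };
    std::map<unsigned char, int> byte_to_id  = { { 'b', 2 } };
    const unsigned char c = 'x';
    auto it = token_to_id.find(std::string(1, c));
    if (it == token_to_id.end()) {
        try {
            printf("byte token %d\n", byte_to_id.at(c));   // throws if absent
        } catch (const std::out_of_range &) {
            fprintf(stderr, "ERROR: byte not found in vocab: '%c'\n", c);
        }
    } else {
        printf("token %d\n", it->second);
    }
    return 0;
}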
@ -3626,7 +3666,7 @@ static void llama_grammar_advance_stack(
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
if (stack.empty()) {
new_stacks.push_back(stack);
new_stacks.emplace_back(stack);
return;
}
@ -3663,7 +3703,7 @@ static void llama_grammar_advance_stack(
}
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
new_stacks.push_back(stack);
new_stacks.emplace_back(stack);
break;
default:
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@ -3828,6 +3868,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
delete grammar;
}
struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
// redirect elements in stacks to point to new rules
for (size_t is = 0; is < result->stacks.size(); is++) {
for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
result->stacks[is][ie] = &result->rules[ir0][ir1];
}
}
}
}
}
return result;
}
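A usage sketch (g is assumed to be an already-initialized llama_grammar pointer): the deep copy keeps per-stream sampling state independent, with each copied stack pointer rebased onto the copy's own rules rather than the original's.

struct llama_grammar * g2 = llama_grammar_copy(g);
// ... advance g and g2 independently while sampling two streams ...
llama_grammar_free(g2);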
//
// sampling
//
@ -4419,7 +4478,7 @@ struct llama_logit_info {
}
return min_heap;
}
float probability_from_logit(float logit) {
float probability_from_logit(float logit) const {
return normalizer * std::exp(logit - max_l);
}
};
@ -4709,6 +4768,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
llm_load_arch(*ml, model);
llm_load_hparams(*ml, model, 0, 0, 0);
if (params->only_copy) {
ftype = model.ftype;
}
const size_t align = GGUF_DEFAULT_ALIGNMENT;
struct gguf_context * ctx_out = gguf_init_empty();
@ -4750,9 +4813,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<std::thread> workers;
std::mutex mutex;
#ifdef GGML_USE_K_QUANTS
auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
};
#endif
int idx = 0;
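use_more_bits spends extra precision on the first and last eighth of the layers, plus every third layer in between. A runnable check of which layers qualify in a 32-layer model:

#include <cstdio>

static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 ||
           (i_layer - num_layers/8) % 3 == 2;
}

int main(void) {
    for (int i = 0; i < 32; ++i) {
        if (use_more_bits(i, 32)) printf("%d ", i);
    }
    printf("\n"); // 0 1 2 3 6 9 12 15 18 21 24 27 28 29 30 31
    return 0;
}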
@ -4795,18 +4860,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// quantize only 2D tensors
quantize &= (tensor->n_dims == 2);
quantize &= params->quantize_output_tensor || name != "output.weight";
quantize &= quantized_type != tensor->type;
quantize &= !params->only_copy;
enum ggml_type new_type;
void * new_data;
size_t new_size;
if (!quantize) {
new_type = tensor->type;
new_data = tensor->data;
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
if (quantize) {
new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
// TODO: avoid hardcoded tensor names - use the TN_* constants
@ -4905,7 +4965,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
}
#endif
// If we've decided to quantize to the same type the tensor is already
// in, then there's nothing to do.
quantize = tensor->type != new_type;
}
if (!quantize) {
new_type = tensor->type;
new_data = tensor->data;
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
const size_t nelements = ggml_nelements(tensor);
float * f32_data;
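The reorder above picks new_type first and only then decides whether quantization is a no-op. A toy restatement of the control flow (all names hypothetical):

#include <cstdio>

enum ToyType { F16, Q4_K };

int main(void) {
    const ToyType tensor_type = Q4_K;   // tensor is already quantized
    const bool only_copy = false;
    bool quantize = !only_copy;                        // plus the 2D/name checks above
    ToyType new_type = quantize ? Q4_K : tensor_type;  // per-tensor choice
    quantize = quantize && (new_type != tensor_type);  // same type: nothing to do
    printf(quantize ? "quantize\n" : "copy\n");        // prints "copy"
    return 0;
}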
@ -5310,7 +5379,7 @@ struct llama_context_params llama_context_default_params() {
/*.seed =*/ LLAMA_DEFAULT_SEED,
/*.n_ctx =*/ 512,
/*.n_batch =*/ 512,
/*.gpu_layers =*/ 0,
/*.n_gpu_layers =*/ 0,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
/*.rope_freq_base =*/ 10000.0f,
@ -5318,7 +5387,7 @@ struct llama_context_params llama_context_default_params() {
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.low_vram =*/ false,
/*.mul_mat_q =*/ false,
/*.mul_mat_q =*/ true,
/*.f16_kv =*/ true,
/*.logits_all =*/ false,
/*.vocab_only =*/ false,
@ -5327,6 +5396,10 @@ struct llama_context_params llama_context_default_params() {
/*.embedding =*/ false,
};
#ifdef GGML_USE_METAL
result.n_gpu_layers = 1;
#endif
return result;
}
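A usage sketch of the new defaults: callers start from llama_context_default_params() and override fields as needed; mul_mat_q is now on by default, and Metal builds offload one layer out of the box.

struct llama_context_params params = llama_context_default_params();
params.n_ctx = 2048;         // override what you need
// params.mul_mat_q = false; // opt back into the old mat-mul kernels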
@ -5336,6 +5409,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.allow_requantize =*/ false,
/*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false,
};
return result;
@ -5518,43 +5592,43 @@ struct llama_context * llama_new_context_with_model(
}
#endif
}
}
#ifdef GGML_USE_METAL
if (params.n_gpu_layers > 0) {
// this allocates all Metal resources and memory buffers
if (params.n_gpu_layers > 0) {
// this allocates all Metal resources and memory buffers
void * data_ptr = NULL;
size_t data_size = 0;
void * data_ptr = NULL;
size_t data_size = 0;
if (params.use_mmap) {
data_ptr = ctx->model.mapping->addr;
data_size = ctx->model.mapping->size;
} else {
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
data_size = ggml_get_mem_size (ctx->model.ctx);
}
if (params.use_mmap) {
data_ptr = ctx->model.mapping->addr;
data_size = ctx->model.mapping->size;
} else {
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
data_size = ggml_get_mem_size (ctx->model.ctx);
}
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
#define LLAMA_METAL_CHECK_BUF(result) \
if (!(result)) { \
LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
llama_free(ctx); \
return NULL; \
}
if (!(result)) { \
LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
llama_free(ctx); \
return NULL; \
}
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
#undef LLAMA_METAL_CHECK_BUF
}
}
#endif
}
#ifdef GGML_USE_MPI
ctx->ctx_mpi = ggml_mpi_init();
@ -5590,15 +5664,19 @@ void llama_free(struct llama_context * ctx) {
}
int llama_n_vocab(const struct llama_context * ctx) {
return ctx->model.vocab.id_to_token.size();
return llama_model_n_vocab(&ctx->model);
}
int llama_n_ctx(const struct llama_context * ctx) {
return ctx->model.hparams.n_ctx;
return llama_model_n_ctx(&ctx->model);
}
int llama_n_ctx_train(const struct llama_context * ctx) {
return llama_model_n_ctx_train(&ctx->model);
}
int llama_n_embd(const struct llama_context * ctx) {
return ctx->model.hparams.n_embd;
return llama_model_n_embd(&ctx->model);
}
enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@ -5613,6 +5691,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
return model->hparams.n_ctx;
}
int llama_model_n_ctx_train(const struct llama_model * model) {
return model->hparams.n_ctx_train;
}
int llama_model_n_embd(const struct llama_model * model) {
return model->hparams.n_embd;
}
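A usage sketch of the new getter (ctx and model assumed initialized): comparing the configured context size against the training context catches silently degraded setups.

const int n_ctx_train = llama_model_n_ctx_train(model);
if (llama_n_ctx(ctx) > n_ctx_train) {
    fprintf(stderr, "warning: n_ctx (%d) exceeds training context (%d)\n",
            llama_n_ctx(ctx), n_ctx_train);
}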
@ -5888,7 +5970,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
rng_ss.str(std::string(&rng_buf[0], rng_size));
rng_ss >> ctx->rng;
GGML_ASSERT(rng_ss.fail() == false);
GGML_ASSERT(!rng_ss.fail());
}
// set logits
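The tightened assertion guards the RNG state round-trip. A self-contained sketch of that round-trip with a standard engine:

#include <cassert>
#include <random>
#include <sstream>

int main(void) {
    std::mt19937 rng(42);
    std::stringstream ss;
    ss << rng;                // serialize the engine state
    std::mt19937 rng2;
    ss >> rng2;               // deserialize into a fresh engine
    assert(!ss.fail());       // the check the patch simplifies to
    assert(rng == rng2);      // identical state, identical future draws
    return 0;
}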
17
llama.h
View file
@ -164,6 +164,7 @@ extern "C" {
enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
} llama_model_quantize_params;
// grammar types
@ -244,15 +245,17 @@ extern "C" {
LLAMA_API bool llama_mmap_supported (void);
LLAMA_API bool llama_mlock_supported(void);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API int llama_n_vocab (const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
LLAMA_API int llama_model_n_vocab (const struct llama_model * model);
LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
// Get a string describing the model type
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
@ -409,6 +412,8 @@ extern "C" {
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
//
// Sampling functions
//
5
mypy.ini Normal file
View file
@ -0,0 +1,5 @@
[mypy]
strict = true
allow_untyped_calls = true
allow_untyped_defs = true
allow_incomplete_defs = true
View file
@ -76,7 +76,7 @@ void * align_with_offset(void * ptr, int offset) {
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
}
void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function<size_t(void)> function) {
void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
int64_t min_time_us = INT64_MAX;
int64_t total_time_us = 0;
int64_t min_time_cycles = INT64_MAX;
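Passing std::function by const reference avoids copying the callable (and any captured state) on every benchmark invocation. A minimal sketch of the same signature change:

#include <cstdio>
#include <functional>

static void run(const std::function<size_t(void)> & fn) { // no copy per call
    printf("%zu\n", fn());
}

int main(void) {
    const size_t n = 7;
    run([n]() -> size_t { return n * 6; }); // prints 42
    return 0;
}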