commit e072111d21
Author: Eric Sommerlade
Date:   2023-09-12 10:53:34 +01:00

    fixed merge conflicts

45 changed files with 2000 additions and 1483 deletions


@@ -3,6 +3,7 @@ Checks: >
     bugprone-*,
     -bugprone-easily-swappable-parameters,
     -bugprone-implicit-widening-of-multiplication-result,
+    -bugprone-misplaced-widening-cast,
     -bugprone-narrowing-conversions,
     readability-*,
     -readability-avoid-unconditional-preprocessor-if,
@@ -15,4 +16,8 @@ Checks: >
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
     portability-*,
+    misc-*,
+    -misc-const-correctness,
+    -misc-non-private-member-variables-in-classes,
+    -misc-no-recursion,
 FormatStyle: none


@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip
+    apt-get install -y build-essential python3 python3-pip git
 COPY requirements.txt requirements.txt


@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
-    apt-get install -y build-essential
+    apt-get install -y build-essential git
 WORKDIR /app


@@ -197,6 +197,62 @@ jobs:
           cd build
           ctest --verbose --timeout 900

+  macOS-latest-cmake-ios:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -G Xcode .. \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=iOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+          cmake --build . --config Release
+
+  macOS-latest-cmake-tvos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -G Xcode .. \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=tvOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+          cmake --build . --config Release
+
   windows-latest-cmake:
     runs-on: windows-latest

.gitignore

@@ -31,27 +31,29 @@ tmp/
 models/*
 models-mnt

-/main
-/quantize
-/quantize-stats
-/result
-/perplexity
-/embedding
-/train-text-from-scratch
-/convert-llama2c-to-ggml
-/simple
-/benchmark-matmult
-/vdot
-/server
 /Pipfile
+/baby-llama
+/beam-search
+/benchmark-matmult
+/convert-llama2c-to-ggml
 /embd-input-test
+/embedding
 /gguf
 /gguf-llama-simple
 /libllama.so
 /llama-bench
-/baby-llama
-/beam-search
+/main
+/metal
+/perplexity
+/quantize
+/quantize-stats
+/result
 /save-load-state
+/server
+/simple
+/speculative
+/train-text-from-scratch
+/vdot
 build-info.h
 arm_neon.h
 compile_commands.json


@@ -36,6 +36,12 @@ endif()
 # Option list
 #

+if (APPLE)
+    set(LLAMA_METAL_DEFAULT ON)
+else()
+    set(LLAMA_METAL_DEFAULT OFF)
+endif()
+
 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
 option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)

@@ -76,7 +82,8 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
-option(LLAMA_METAL "llama: use Metal" OFF)
+option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
+option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -158,6 +165,33 @@ if (APPLE AND LLAMA_ACCELERATE)
     endif()
 endif()

+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+
+    message(STATUS "Metal framework found")
+    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
+
+    add_compile_definitions(GGML_USE_METAL)
+    if (LLAMA_METAL_NDEBUG)
+        add_compile_definitions(GGML_METAL_NDEBUG)
+    endif()
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-metal.metal to bin directory
+    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+    )
+endif()
+
 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)

@@ -293,29 +327,6 @@ if (LLAMA_CUBLAS)
     endif()
 endif()

-if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-
-    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
-
-    add_compile_definitions(GGML_USE_METAL)
-    #add_compile_definitions(GGML_METAL_NDEBUG)
-
-    # get full path to the file
-    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
-    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
-
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-    )
-endif()
-
 if (LLAMA_MPI)
     cmake_minimum_required(VERSION 3.10)
     find_package(MPI)

@@ -415,7 +426,7 @@ if (LLAMA_ALL_WARNINGS)
     )
     if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
         # g++ only
-        set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+        set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
     endif()
 else()
     # todo : msvc

@@ -472,7 +483,7 @@ if (NOT MSVC)
     endif()
 endif()

-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")
+if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
     message(STATUS "ARM detected")
     if (MSVC)
         add_compile_definitions(__ARM_NEON)

@@ -551,12 +562,66 @@ else()
     message(STATUS "Unknown architecture")
 endif()

+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+add_compile_definitions(_XOPEN_SOURCE=600)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    remove_definitions(-D_XOPEN_SOURCE=600)
+    add_compile_definitions(_XOPEN_SOURCE=700)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (
+    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
+    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
+)
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()
+
 #
 # libraries
 #

 # ggml

+if (GGML_USE_CPU_HBM)
+    add_definitions(-DGGML_USE_CPU_HBM)
+    find_library(memkind memkind REQUIRED)
+endif()
+
 add_library(ggml OBJECT
             ggml.c
             ggml.h

@@ -572,6 +637,9 @@ add_library(ggml OBJECT
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+if (GGML_USE_CPU_HBM)
+    target_link_libraries(ggml PUBLIC memkind)
+endif()

 add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
 if (BUILD_SHARED_LIBS)
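As a side note on the new POSIX-conformance block: it only defines feature-test macros, and the hedged sketch below (illustrative, not part of the commit) shows the kind of call sites those macros expose once `_XOPEN_SOURCE=600` and friends are set.

```cpp
// Illustrative only: code that relies on the feature-test macros configured above.
// Assumes a POSIX toolchain and a build with -D_XOPEN_SOURCE=600.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main() {
    // clock_gettime / CLOCK_MONOTONIC: POSIX.1b / POSIX.1-2001
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);

    // posix_memalign: POSIX.1-2001
    void * buf = NULL;
    if (posix_memalign(&buf, 32, 1024) == 0) {
        printf("monotonic clock: %lld.%09ld s\n", (long long) ts.tv_sec, ts.tv_nsec);
        free(buf);
    }
    return 0;
}
```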

Makefile

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1

@@ -7,11 +7,44 @@ TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-dou
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
+endif
+
+ifndef UNAME_P
+UNAME_P := $(shell uname -p)
+endif
+
+ifndef UNAME_M
+UNAME_M := $(shell uname -m)
+endif
+
+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+ifeq ($(UNAME_S),Darwin)
+    ifndef LLAMA_NO_METAL
+        LLAMA_METAL := 1
+    endif
+
+    ifneq ($(UNAME_P),arm)
+        SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
+        ifeq ($(SYSCTL_M),1)
+            # UNAME_P := arm
+            # UNAME_M := arm64
+            warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
+        endif
+    endif
+endif
+
+ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
+BUILD_TARGETS += metal
+endif
+
 default: $(BUILD_TARGETS)

-test:
-    @echo "Running tests..."
-    @for test_target in $(TEST_TARGETS); do \
+test: $(TEST_TARGETS)
+    @failures=0; \
+    for test_target in $(TEST_TARGETS); do \
         if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
             ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
         elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \

@@ -19,10 +52,21 @@ test:
         elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
             continue; \
         else \
+            echo "Running test $$test_target..."; \
             ./$$test_target; \
         fi; \
-    done
-    @echo "All tests have been run."
+        if [ $$? -ne 0 ]; then \
+            printf 'Test $$test_target FAILED!\n\n' $$test_target; \
+            failures=$$(( failures + 1 )); \
+        else \
+            printf 'Test %s passed.\n\n' $$test_target; \
+        fi; \
+    done; \
+    if [ $$failures -gt 0 ]; then \
+        printf '\n%s tests failed.\n' $$failures; \
+        exit 1; \
+    fi
+    @echo 'All tests passed.'

 all: $(BUILD_TARGETS) $(TEST_TARGETS)

@@ -38,18 +82,6 @@ gcovr-report: coverage ## Generate gcovr report
     mkdir -p gcovr-report
     gcovr --root . --html --html-details --output gcovr-report/coverage.html

-ifndef UNAME_S
-UNAME_S := $(shell uname -s)
-endif
-
-ifndef UNAME_P
-UNAME_P := $(shell uname -p)
-endif
-
-ifndef UNAME_M
-UNAME_M := $(shell uname -m)
-endif
-
 ifdef RISCV_CROSS_COMPILE
 CC := riscv64-unknown-linux-gnu-gcc
 CXX := riscv64-unknown-linux-gnu-g++

@@ -58,19 +90,6 @@ endif
 CCV := $(shell $(CC) --version | head -n 1)
 CXXV := $(shell $(CXX) --version | head -n 1)

-# Mac OS + Arm can report x86_64
-# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
-ifeq ($(UNAME_S),Darwin)
-    ifneq ($(UNAME_P),arm)
-        SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
-        ifeq ($(SYSCTL_M),1)
-            # UNAME_P := arm
-            # UNAME_M := arm64
-            warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
-        endif
-    endif
-endif
-
 #
 # Compile flags
 #

@@ -83,10 +102,60 @@ else
 OPT = -O3
 endif

 MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS = $(CPPFLAGS) $(OPT) -std=c11 -fPIC
-MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
+MK_CFLAGS = $(OPT) -std=c11 -fPIC
+MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
 MK_LDFLAGS =

+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+MK_CFLAGS   += -D_XOPEN_SOURCE=600
+MK_CXXFLAGS += -D_XOPEN_SOURCE=600
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+ifeq ($(UNAME_S),OpenBSD)
+    MK_CFLAGS   += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
+    MK_CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
+endif
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+ifeq ($(UNAME_S),Linux)
+    MK_CFLAGS   += -D_GNU_SOURCE
+    MK_CXXFLAGS += -D_GNU_SOURCE
+endif
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+ifeq ($(UNAME_S),Darwin)
+    MK_CFLAGS   += -D_DARWIN_C_SOURCE
+    MK_CXXFLAGS += -D_DARWIN_C_SOURCE
+endif
+ifeq ($(UNAME_S),DragonFly)
+    MK_CFLAGS   += -D__BSD_VISIBLE
+    MK_CXXFLAGS += -D__BSD_VISIBLE
+endif
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+ifeq ($(UNAME_S),FreeBSD)
+    MK_CFLAGS   += -D__BSD_VISIBLE
+    MK_CXXFLAGS += -D__BSD_VISIBLE
+endif
+ifeq ($(UNAME_S),NetBSD)
+    MK_CFLAGS   += -D_NETBSD_SOURCE
+    MK_CXXFLAGS += -D_NETBSD_SOURCE
+endif
+ifeq ($(UNAME_S),OpenBSD)
+    MK_CFLAGS   += -D_BSD_SOURCE
+    MK_CXXFLAGS += -D_BSD_SOURCE
+endif
+
 ifdef LLAMA_DEBUG
     MK_CFLAGS += -O0 -g
     MK_CXXFLAGS += -O0 -g

@@ -101,12 +170,11 @@ endif

 ifdef LLAMA_CODE_COVERAGE
-    CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
+    MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
 endif

 ifdef LLAMA_DISABLE_LOGS
-    CFLAGS += -DLOG_DISABLE_LOGS
-    CXXFLAGS += -DLOG_DISABLE_LOGS
+    MK_CPPFLAGS += -DLOG_DISABLE_LOGS
 endif # LLAMA_DISABLE_LOGS

 # warnings

@@ -116,7 +184,7 @@ MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-m
 ifeq '' '$(findstring clang++,$(CXX))'
     # g++ only
-    CXXFLAGS += -Wno-format-truncation
+    MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
 endif

 # OS specific

@@ -180,8 +248,8 @@ endif
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
 # https://github.com/ggerganov/llama.cpp/issues/2922
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
-    CFLAGS   += -Xassembler -muse-unaligned-vector-move
-    CXXFLAGS += -Xassembler -muse-unaligned-vector-move
+    MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
+    MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
 endif

 ifneq ($(filter aarch64%,$(UNAME_M)),)

@@ -218,8 +286,8 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
     endif
 else
-    CFLAGS   += -march=rv64gcv -mabi=lp64d
-    CXXFLAGS += -march=rv64gcv -mabi=lp64d
+    MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
+    MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif

 ifndef LLAMA_NO_K_QUANTS

@@ -231,8 +299,8 @@ endif
 endif

 ifndef LLAMA_NO_ACCELERATE
-    # Mac M1 - include Accelerate framework.
-    # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
+    # Mac OS - include Accelerate framework.
+    # `-framework Accelerate` works both with Apple Silicon and Mac Intel
     ifeq ($(UNAME_S),Darwin)
         MK_CPPFLAGS += -DGGML_USE_ACCELERATE
         MK_LDFLAGS += -framework Accelerate

@@ -350,9 +418,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif # LLAMA_HIPBLAS

 ifdef LLAMA_METAL
-    MK_CPPFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
+    MK_CPPFLAGS += -DGGML_USE_METAL
     MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
     OBJS += ggml-metal.o
+ifdef LLAMA_METAL_NDEBUG
+    MK_CPPFLAGS += -DGGML_METAL_NDEBUG
+endif
 endif # LLAMA_METAL

 ifdef LLAMA_METAL

@@ -371,9 +442,8 @@ k_quants.o: k_quants.c k_quants.h
 endif # LLAMA_NO_K_QUANTS

 # combine build flags with cmdline overrides
-override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
-override CFLAGS   := $(MK_CFLAGS) $(CFLAGS)
-override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CFLAGS   := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
 override LDFLAGS  := $(MK_LDFLAGS) $(LDFLAGS)

 #

@@ -477,9 +547,8 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
 beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
-BUILD_TARGETS += metal
-endif
+speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)


@@ -2,8 +2,30 @@

 import PackageDescription

+#if arch(arm) || arch(arm64)
+let platforms: [SupportedPlatform]? = [
+    .macOS(.v11),
+    .iOS(.v14),
+    .watchOS(.v4),
+    .tvOS(.v14)
+]
+let exclude: [String] = []
+let additionalSources: [String] = ["ggml-metal.m"]
+let additionalSettings: [CSetting] = [
+    .unsafeFlags(["-fno-objc-arc"]),
+    .define("GGML_SWIFT"),
+    .define("GGML_USE_METAL")
+]
+#else
+let platforms: [SupportedPlatform]? = nil
+let exclude: [String] = ["ggml-metal.metal"]
+let additionalSources: [String] = []
+let additionalSettings: [CSetting] = []
+#endif
+
 let package = Package(
     name: "llama",
+    platforms: platforms,
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],

@@ -11,23 +33,23 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
-            exclude: ["ggml-metal.metal"],
+            exclude: exclude,
             sources: [
                 "ggml.c",
                 "llama.cpp",
                 "ggml-alloc.c",
-                "k_quants.c"
-            ],
+                "k_quants.c",
+            ] + additionalSources,
             publicHeadersPath: "spm-headers",
             cSettings: [
                 .unsafeFlags(["-Wno-shorten-64-to-32"]),
                 .define("GGML_USE_K_QUANTS"),
                 .define("GGML_USE_ACCELERATE")
-            ],
+            ] + additionalSettings,
             linkerSettings: [
                 .linkedFramework("Accelerate")
             ]
-        ),
+        )
     ],
     cxxLanguageStandard: .cxx11
 )


@@ -11,21 +11,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

-- #### IMPORTANT: Tokenizer fixes and API change (developers and projects using `llama.cpp` built-in tokenization must read): https://github.com/ggerganov/llama.cpp/pull/2810
-- GGUFv2 adds support for 64-bit sizes + backwards compatible: https://github.com/ggerganov/llama.cpp/pull/2821
-- Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
-- A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
-  Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
-### Current `master` should be considered in Beta - expect some issues for a few days!
-### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
-### Issues with non-GGUF models will be considered with low priority!
+- Local Falcon 180B inference on Mac Studio
+
+  https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e

 ----
@@ -280,29 +268,11 @@ In order to build llama.cpp you have three different options.

 ### Metal Build

-Using Metal allows the computation to be executed on the GPU for Apple devices:
-
-- Using `make`:
-
-  ```bash
-  LLAMA_METAL=1 make
-  ```
-
-- Using `CMake`:
-
-  ```bash
-  mkdir build-metal
-  cd build-metal
-  cmake -DLLAMA_METAL=ON ..
-  cmake --build . --config Release
-  ```
-
-When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
-Any value larger than 0 will offload the computation to the GPU. For example:
-
-```bash
-./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
-```
+On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
+To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
+
+When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line
+argument.
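For readers driving this through the C API rather than the CLI, a hedged sketch (not from the repository) of what `-ngl 0` corresponds to; `llama_context_default_params()` and the `n_gpu_layers` field are the only assumed symbols:

```cpp
// Hedged sketch: the programmatic counterpart of `./main -m model.gguf -ngl 0`
// on a Metal-enabled build - keep every layer on the CPU by offloading none.
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 0; // 0 = no layers offloaded to the GPU

    // ... load the model with these params and run inference as usual ...
    return 0;
}
```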
 ### MPI Build

@@ -431,7 +401,7 @@ Building the program with BLAS support may lead to some performance improvements
 - #### hipBLAS

-  This provide BLAS acceleation on HIP supported GPU like AMD GPU.
+  This provides BLAS acceleration on HIP-supported AMD GPUs.
   Make sure to have ROCm installed.
   You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
   Windows support is coming soon...
@@ -755,12 +725,12 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
 - Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
 - Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
-  - [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGML)
-  - [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGML)
-  - [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGML)
-  - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
-  - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
-  - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
+  - [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGUF)
+  - [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGUF)
+  - [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGUF)
+  - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGUF)
+  - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF)
+  - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF)

 ### Verifying the model files


@@ -57,7 +57,7 @@ int32_t get_num_physical_cores() {
             siblings.insert(line);
         }
     }
-    if (siblings.size() > 0) {
+    if (!siblings.empty()) {
         return static_cast<int32_t>(siblings.size());
    }
 #elif defined(__APPLE__) && defined(__MACH__)
@ -584,109 +584,109 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
} }
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, "usage: %s [options]\n", argv[0]); printf("usage: %s [options]\n", argv[0]);
fprintf(stdout, "\n"); printf("\n");
fprintf(stdout, "options:\n"); printf("options:\n");
fprintf(stdout, " -h, --help show this help message and exit\n"); printf(" -h, --help show this help message and exit\n");
fprintf(stdout, " -i, --interactive run in interactive mode\n"); printf(" -i, --interactive run in interactive mode\n");
fprintf(stdout, " --interactive-first run in interactive mode and wait for input right away\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n");
fprintf(stdout, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
fprintf(stdout, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
fprintf(stdout, " -r PROMPT, --reverse-prompt PROMPT\n"); printf(" -r PROMPT, --reverse-prompt PROMPT\n");
fprintf(stdout, " halt generation at PROMPT, return control in interactive mode\n"); printf(" halt generation at PROMPT, return control in interactive mode\n");
fprintf(stdout, " (can be specified more than once for multiple prompts).\n"); printf(" (can be specified more than once for multiple prompts).\n");
fprintf(stdout, " --color colorise output to distinguish prompt and user input from generations\n"); printf(" --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stdout, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -p PROMPT, --prompt PROMPT\n"); printf(" -p PROMPT, --prompt PROMPT\n");
fprintf(stdout, " prompt to start generation with (default: empty)\n"); printf(" prompt to start generation with (default: empty)\n");
fprintf(stdout, " -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
fprintf(stdout, " not supported with --interactive or other interactive options\n"); printf(" not supported with --interactive or other interactive options\n");
fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
fprintf(stdout, " --random-prompt start with a randomized prompt.\n"); printf(" --random-prompt start with a randomized prompt.\n");
fprintf(stdout, " --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n"); printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
fprintf(stdout, " -f FNAME, --file FNAME\n"); printf(" -f FNAME, --file FNAME\n");
fprintf(stdout, " prompt file to start generation.\n"); printf(" prompt file to start generation.\n");
fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
fprintf(stdout, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
fprintf(stdout, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
fprintf(stdout, " --mirostat N use Mirostat sampling.\n"); printf(" --mirostat N use Mirostat sampling.\n");
fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
fprintf(stdout, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
fprintf(stdout, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
fprintf(stdout, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
fprintf(stdout, " modifies the likelihood of token appearing in the completion,\n"); printf(" modifies the likelihood of token appearing in the completion,\n");
fprintf(stdout, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
fprintf(stdout, " --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
fprintf(stdout, " --grammar-file FNAME file to read grammar from\n"); printf(" --grammar-file FNAME file to read grammar from\n");
fprintf(stdout, " --cfg-negative-prompt PROMPT\n"); printf(" --cfg-negative-prompt PROMPT\n");
fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n"); printf(" negative prompt to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-negative-prompt-file FNAME\n"); printf(" --cfg-negative-prompt-file FNAME\n");
fprintf(stdout, " negative prompt file to use for guidance. (default: empty)\n"); printf(" negative prompt file to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale); printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale); printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
fprintf(stdout, " --no-penalize-nl do not penalize newline token\n"); printf(" --no-penalize-nl do not penalize newline token\n");
fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp); printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n"); printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
fprintf(stdout, " --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
if (llama_mlock_supported()) { if (llama_mlock_supported()) {
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
} }
if (llama_mmap_supported()) { if (llama_mmap_supported()) {
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
} }
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n"); printf(" --numa attempt optimizations that help on some NUMA systems\n");
fprintf(stdout, " if run without this previously, it is recommended to drop the system page cache before using this\n"); printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
fprintf(stdout, " see https://github.com/ggerganov/llama.cpp/issues/1437\n"); printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stdout, " -ngl N, --n-gpu-layers N\n"); printf(" -ngl N, --n-gpu-layers N\n");
fprintf(stdout, " number of layers to store in VRAM\n"); printf(" number of layers to store in VRAM\n");
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); printf(" -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
fprintf(stdout, " -nommq, --no-mul-mat-q\n"); printf(" -nommq, --no-mul-mat-q\n");
fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS
#endif #endif
fprintf(stdout, " --mtest compute maximum memory usage\n"); printf(" --mtest compute maximum memory usage\n");
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n"); printf(" --export export the computation graph to 'llama.ggml'\n");
fprintf(stdout, " --verbose-prompt print prompt before generation\n"); printf(" --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stdout, " -m FNAME, --model FNAME\n"); printf(" -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); printf(" model path (default: %s)\n", params.model.c_str());
fprintf(stdout, " -md FNAME, --model-draft FNAME\n"); printf(" -md FNAME, --model-draft FNAME\n");
fprintf(stdout, " draft model for speculative decoding (default: %s)\n", params.model.c_str()); printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
fprintf(stdout, " -ld LOGDIR, --logdir LOGDIR\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n");
fprintf(stdout, " path under which to save YAML logs (no logging if unset)\n"); printf(" path under which to save YAML logs (no logging if unset)\n");
fprintf(stdout, "\n"); printf("\n");
} }
std::string gpt_random_prompt(std::mt19937 & rng) { std::string gpt_random_prompt(std::mt19937 & rng) {
@@ -717,7 +717,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.n_ctx = params.n_ctx;
     lparams.n_batch = params.n_batch;
-    lparams.n_gpu_layers = params.n_gpu_layers;
+    if (params.n_gpu_layers != -1) {
+        lparams.n_gpu_layers = params.n_gpu_layers;
+    }
     lparams.main_gpu = params.main_gpu;
     lparams.tensor_split = params.tensor_split;
     lparams.low_vram = params.low_vram;
@@ -770,8 +772,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     {
         LOG("warming up the model with an empty run\n");

-        const std::vector<llama_token> tmp = { llama_token_bos(lctx), };
-        llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
+        llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
         llama_reset_timings(lctx);
     }
@@ -1212,7 +1214,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
+    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");


@@ -20,6 +20,9 @@
 #define DIRECTORY_SEPARATOR '/'
 #endif // _WIN32

+#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)
+
 //
 // CLI argument parsing
 //

@@ -34,7 +37,7 @@ struct gpt_params {
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 16; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
-    int32_t n_gpu_layers = 0; // number of layers to store in VRAM
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
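A hedged usage sketch for the new `die()`/`die_fmt()` helpers; the macros are copied from the hunk above, while `open_or_die()` is a hypothetical example function:

```cpp
// Illustrative only: how a call site typically uses the fatal-error helpers.
#include <stdio.h>
#include <stdlib.h>

#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)

static FILE * open_or_die(const char * path) {
    FILE * fp = fopen(path, "rb");
    if (fp == NULL) {
        die_fmt("failed to open '%s'", path); // formatted message, then exit(1)
    }
    return fp;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        die("usage: prog <file>"); // fixed-string message, then exit(1)
    }
    fclose(open_or_die(argv[1]));
    return 0;
}
```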


@@ -415,6 +415,7 @@ namespace grammar_parser {
     std::vector<const llama_grammar_element *> parse_state::c_rules() {
         std::vector<const llama_grammar_element *> ret;
+        ret.reserve(rules.size());
         for (const auto & rule : rules) {
             ret.push_back(rule.data());
         }


@@ -513,16 +513,16 @@ inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string &
 inline void log_print_usage()
 {
-    fprintf(stdout, "log options:\n");
+    printf("log options:\n");
     /* format
-    fprintf(stdout, " -h, --help show this help message and exit\n");*/
+    printf(" -h, --help show this help message and exit\n");*/
     /* spacing
-    fprintf(stdout, "__-param----------------Description\n");*/
-    fprintf(stdout, " --log-test Run simple logging test\n");
-    fprintf(stdout, " --log-disable Disable trace logs\n");
-    fprintf(stdout, " --log-enable Enable trace logs\n");
-    fprintf(stdout, " --log-file Specify a log filename (without extension)\n");
-    fprintf(stdout, " Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
+    printf("__-param----------------Description\n");*/
+    printf(" --log-test Run simple logging test\n");
+    printf(" --log-disable Disable trace logs\n");
+    printf(" --log-enable Enable trace logs\n");
+    printf(" --log-file Specify a log filename (without extension)\n");
+    printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
 }

 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)


@@ -55,10 +55,10 @@ def count_model_parts(dir_model: Path) -> int:

 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
-    parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
+    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
     return parser.parse_args()

 args = parse_args()


@@ -5,6 +5,7 @@ import argparse
 import math
 import struct
 import sys
+from enum import IntEnum
 from pathlib import Path

 import numpy as np
@@ -34,10 +35,35 @@ GGML_QUANT_SIZES = {
     gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
 }

+class GGMLFormat(IntEnum):
+    GGML = 0
+    GGMF = 1
+    GGJT = 2
+
+class GGMLFType(IntEnum):
+    ALL_F32 = 0
+    MOSTLY_F16 = 1
+    MOSTLY_Q4_0 = 2
+    MOSTLY_Q4_1 = 3
+    MOSTLY_Q4_1_SOME_F16 = 4
+    MOSTLY_Q8_0 = 7
+    MOSTLY_Q5_0 = 8
+    MOSTLY_Q5_1 = 9
+    MOSTLY_Q2_K = 10
+    MOSTLY_Q3_K_S = 11
+    MOSTLY_Q3_K_M = 12
+    MOSTLY_Q3_K_L = 13
+    MOSTLY_Q4_K_S = 14
+    MOSTLY_Q4_K_M = 15
+    MOSTLY_Q5_K_S = 16
+    MOSTLY_Q5_K_M = 17
+    MOSTLY_Q6_K = 18
+
 class Hyperparameters:
     def __init__(self):
-        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
-        self.n_ff = 0
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
+        self.n_layer = self.n_rot = self.n_ff = 0
+        self.ftype = GGMLFType.ALL_F32

     def set_n_ff(self, model):
         ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
@@ -53,16 +79,21 @@ class Hyperparameters:
             self.n_head,
             self.n_layer,
             self.n_rot,
-            self.ftype,
+            ftype,
         ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        try:
+            self.ftype = GGMLFType(ftype)
+        except ValueError:
+            raise ValueError(f'Invalid ftype {ftype}')
         return 4 * 7

     def __str__(self):
-        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'

 class Vocab:
-    def __init__(self):
+    def __init__(self, load_scores = True):
         self.items = []
+        self.load_scores = load_scores

     def load(self, data, offset, n_vocab):
         orig_offset = offset
@@ -70,20 +101,24 @@ class Vocab:
             itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
             assert itemlen < 4096, 'Absurd vocab item length'
             offset += 4
-            vocab = bytes(data[offset:offset + itemlen])
+            item_text = bytes(data[offset:offset + itemlen])
             offset += itemlen
-            score = struct.unpack('<f', data[offset:offset + 4])[0]
-            offset += 4
-            self.items.append((vocab, score))
+            if self.load_scores:
+                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
+                offset += 4
+            else:
+                item_score = 0.0
+            self.items.append((item_text, item_score))
         return offset - orig_offset

 class Tensor:
-    def __init__(self):
+    def __init__(self, use_padding = True):
         self.name = None
         self.dims: tuple[int, ...] = ()
         self.dtype = None
         self.start_offset = 0
         self.len_bytes = np.int64(0)
+        self.use_padding = use_padding

     def load(self, data, offset):
         orig_offset = offset
@@ -99,7 +134,7 @@ class Tensor:
         offset += 4 * n_dims
         self.name = bytes(data[offset:offset + name_len])
         offset += name_len
-        pad = ((offset + 31) & ~31) - offset
+        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
         offset += pad
         n_elems = np.prod(self.dims)
         n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
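The new `use_padding` switch only affects the 32-byte alignment step; as a hedged illustration of that arithmetic (not part of the commit):

```cpp
// Illustrative only: the alignment rule pad = ((offset + 31) & ~31) - offset,
// which rounds a tensor-data offset up to the next 32-byte boundary.
#include <stdint.h>
#include <stdio.h>

int main() {
    const uint64_t offsets[] = {0, 1, 31, 32, 100};
    for (uint64_t offset : offsets) {
        uint64_t pad = ((offset + 31) & ~UINT64_C(31)) - offset;
        printf("offset %3llu -> pad %2llu -> aligned %3llu\n",
               (unsigned long long) offset,
               (unsigned long long) pad,
               (unsigned long long) (offset + pad));
    }
    return 0;
}
```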
@@ -109,7 +144,7 @@
         # print(n_dims, name_len, dtype, self.dims, self.name, pad)
         return offset - orig_offset

-class GGMLV3Model:
+class GGMLModel:
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None
@ -117,20 +152,52 @@ class GGMLV3Model:
self.tensors = [] self.tensors = []
def validate_header(self, data, offset): def validate_header(self, data, offset):
if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3: magic = bytes(data[offset:offset + 4])
raise ValueError('Only GGJTv3 supported') if magic == b'GGUF':
return 8 raise ValueError('File is already in GGUF format.')
if magic == b'lmgg':
self.file_format = GGMLFormat.GGML
self.format_version = 1
return 4
version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
if magic == b'fmgg':
if version != 1:
raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
self.file_format = GGMLFormat.GGMF
self.format_version = version
return 8
if magic == b'tjgg':
if version < 1 or version > 3:
raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
self.file_format = GGMLFormat.GGJT
self.format_version = version
return 8
raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
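The magics compared above look reversed because the legacy magic words are stored as little-endian uint32s: 'ggml' reads back as b'lmgg', 'ggmf' as b'fmgg' and 'ggjt' as b'tjgg'. A short summary of the format ladder, as implied by the flags used later in this loader:

    LEGACY_MAGICS = {
        b'lmgg': 'GGML',  # unversioned; vocab entries carry no scores
        b'fmgg': 'GGMF',  # adds a version field (only v1 is accepted)
        b'tjgg': 'GGJT',  # adds 32-byte tensor alignment; v1-v3 accepted
    }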
def validate_conversion(self, ftype):
err = ''
if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
err = 'Q4 and Q8 quantizations changed in GGJTv3.'
if len(err) > 0:
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
def load(self, data, offset): def load(self, data, offset):
offset += self.validate_header(data, offset) offset += self.validate_header(data, offset)
hp = Hyperparameters() hp = Hyperparameters()
offset += hp.load(data, offset) offset += hp.load(data, offset)
vocab = Vocab() print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
self.validate_conversion(hp.ftype)
vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
offset += vocab.load(data, offset, hp.n_vocab) offset += vocab.load(data, offset, hp.n_vocab)
tensors: list[Tensor] = [] tensors: list[Tensor] = []
tensor_map = {} tensor_map = {}
while offset < len(data): while offset < len(data):
tensor = Tensor() tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
offset += tensor.load(data, offset) offset += tensor.load(data, offset)
tensor_map[tensor.name] = len(tensors) tensor_map[tensor.name] = len(tensors)
tensors.append(tensor) tensors.append(tensor)
@ -168,7 +235,10 @@ class GGMLToGGUF:
def save(self): def save(self):
print('* Preparing to save GGUF file') print('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False) gguf_writer = gguf.GGUFWriter(
self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
use_temp_file = False )
self.add_params(gguf_writer) self.add_params(gguf_writer)
self.add_vocab(gguf_writer) self.add_vocab(gguf_writer)
if self.special_vocab is not None: if self.special_vocab is not None:
@ -185,7 +255,10 @@ class GGMLToGGUF:
def add_params(self, gguf_writer): def add_params(self, gguf_writer):
hp = self.model.hyperparameters hp = self.model.hyperparameters
cfg = self.cfg cfg = self.cfg
desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format' if cfg.desc is not None:
desc = cfg.desc
else:
desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
try: try:
# Filenames aren't necessarily valid UTF8. # Filenames aren't necessarily valid UTF8.
name = cfg.name if cfg.name is not None else cfg.input.name name = cfg.name if cfg.name is not None else cfg.input.name
@ -195,6 +268,7 @@ class GGMLToGGUF:
if name is not None: if name is not None:
gguf_writer.add_name(name) gguf_writer.add_name(name)
gguf_writer.add_description(desc) gguf_writer.add_description(desc)
gguf_writer.add_file_type(int(hp.ftype))
if self.params_override is not None: if self.params_override is not None:
po = self.params_override po = self.params_override
assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch' assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
@ -231,7 +305,8 @@ class GGMLToGGUF:
tokens.append(vbytes) tokens.append(vbytes)
scores.append(score) scores.append(score)
toktypes.append(ttype) toktypes.append(ttype)
assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}' assert len(tokens) == hp.n_vocab, \
f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
gguf_writer.add_token_list(tokens) gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores) gguf_writer.add_token_scores(scores)
if len(toktypes) > 0: if len(toktypes) > 0:
@ -283,7 +358,11 @@ class GGMLToGGUF:
tempdims[1] = tempdims[0] tempdims[1] = tempdims[0]
tempdims[0] = temp tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}') # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype) gguf_writer.add_tensor(
mapped_name,
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
raw_shape = tempdims,
raw_dtype = tensor.dtype )
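The dimension swap above presumably reflects that legacy GGML lists tensor dimensions fastest-varying first, whereas the GGUF writer's raw_shape is given in numpy (row-major) order; a minimal sketch of that reordering:

    def to_raw_shape(ggml_dims):
        # swap the first two entries of a 2D shape; 1D shapes pass through
        dims = list(ggml_dims)
        if len(dims) > 1:
            dims[0], dims[1] = dims[1], dims[0]
        return dims

    assert to_raw_shape((4096, 32000)) == [32000, 4096]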
def handle_metadata(cfg, hp): def handle_metadata(cfg, hp):
import convert import convert
@ -305,32 +384,46 @@ def handle_metadata(cfg, hp):
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path) params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else: else:
raise ValueError('Unable to load metadata') raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype) vocab = convert.load_vocab(
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
cfg.vocabtype )
# FIXME: Respect cfg.vocab_dir? # FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir) svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab) convert.check_vocab_size(params, vocab)
return (params, vocab, svocab) return (params, vocab, svocab)
def handle_args(): def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF') parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename') parser.add_argument('--input', '-i', type = Path, required = True,
parser.add_argument('--output', '-o', type = Path, required = True, help ='Output GGUF filename') help = 'Input GGMLv3 filename')
parser.add_argument('--name', help = 'Set model name') parser.add_argument('--output', '-o', type = Path, required = True,
parser.add_argument('--desc', help = 'Set model description') help ='Output GGUF filename')
parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') parser.add_argument('--name',
parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') help = 'Set model name')
parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') parser.add_argument('--desc',
parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory') help = 'Set model description')
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") parser.add_argument('--gqa', type = int, default = 1,
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm") help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06',
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048,
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path,
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path,
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
return parser.parse_args() return parser.parse_args()
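For orientation only, a typical invocation of this converter (the script name and model file names here are placeholders, and per the help text above --eps and --gqa only need overriding for LLaMA2-family models):

    python convert-llama-ggml-to-gguf.py --input llama-2-7b.ggmlv3.q8_0.bin --output llama-2-7b.q8_0.gguf --eps 1e-5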
def main(): def main():
cfg = handle_args() cfg = handle_args()
print(f'* Using config: {cfg}') print(f'* Using config: {cfg}')
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n') print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
data = np.memmap(cfg.input, mode = 'r') data = np.memmap(cfg.input, mode = 'r')
model = GGMLV3Model() model = GGMLModel()
print('* Scanning GGML input file') print('* Scanning GGML input file')
offset = model.load(data, 0) offset = model.load(data, 0)
print(f'* GGML model hyperparameters: {model.hyperparameters}') print(f'* GGML model hyperparameters: {model.hyperparameters}')
@ -345,7 +438,12 @@ def main():
print(f'* Special vocab: {special_vocab}') print(f'* Special vocab: {special_vocab}')
else: else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab) if model.file_format == GGMLFormat.GGML:
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
converter = GGMLToGGUF(model, data, cfg,
params_override = params_override,
vocab_override = vocab_override,
special_vocab = special_vocab )
converter.save() converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}') print(f'* Successful completion. Output saved to: {cfg.output}')
View file
@ -145,7 +145,6 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
class Params: class Params:
n_vocab: int n_vocab: int
n_embd: int n_embd: int
n_mult: int
n_layer: int n_layer: int
n_ctx: int n_ctx: int
n_ff: int n_ff: int
@ -161,15 +160,6 @@ class Params:
# path to the directory containing the model files # path to the directory containing the model files
path_model: Path | None = None path_model: Path | None = None
@staticmethod
def find_n_mult(n_ff: int, n_embd: int) -> int:
# hardcoded magic range
for n_mult in range(8192, 1, -1):
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
if calc_ff == n_ff:
return n_mult
raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
@staticmethod @staticmethod
def guessed(model: LazyModel) -> Params: def guessed(model: LazyModel) -> Params:
# try transformer naming first # try transformer naming first
@ -197,7 +187,6 @@ class Params:
return Params( return Params(
n_vocab = n_vocab, n_vocab = n_vocab,
n_embd = n_embd, n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer, n_layer = n_layer,
n_ctx = -1, n_ctx = -1,
n_ff = n_ff, n_ff = n_ff,
@ -225,8 +214,6 @@ class Params:
else: else:
f_rope_scale = None f_rope_scale = None
n_mult = Params.find_n_mult(n_ff, n_embd)
if "max_sequence_length" in config: if "max_sequence_length" in config:
n_ctx = config["max_sequence_length"] n_ctx = config["max_sequence_length"]
elif "max_position_embeddings" in config: elif "max_position_embeddings" in config:
@ -238,7 +225,6 @@ class Params:
return Params( return Params(
n_vocab = n_vocab, n_vocab = n_vocab,
n_embd = n_embd, n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer, n_layer = n_layer,
n_ctx = n_ctx, n_ctx = n_ctx,
n_ff = n_ff, n_ff = n_ff,
@ -250,7 +236,7 @@ class Params:
) )
# LLaMA v2 70B params.json # LLaMA v2 70B params.json
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1 # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod @staticmethod
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
@ -258,7 +244,6 @@ class Params:
n_vocab = config["vocab_size"] if "vocab_size" in config else -1 n_vocab = config["vocab_size"] if "vocab_size" in config else -1
n_embd = config["dim"] n_embd = config["dim"]
n_layer = config["n_layers"] n_layer = config["n_layers"]
n_mult = config["multiple_of"]
n_ff = -1 n_ff = -1
n_head = config["n_heads"] n_head = config["n_heads"]
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
@ -266,7 +251,7 @@ class Params:
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
# hack to determine LLaMA v1 vs v2 vs CodeLlama # hack to determine LLaMA v1 vs v2 vs CodeLlama
if f_rope_freq_base and f_rope_freq_base == 1000000: if f_rope_freq_base == 1000000:
# CodeLlama # CodeLlama
n_ctx = 16384 n_ctx = 16384
elif config["norm_eps"] == 1e-05: elif config["norm_eps"] == 1e-05:
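The branches above guess the training context from fields that params.json does store; a minimal sketch of the heuristic, assuming the non-CodeLlama defaults follow the "2048 for LLaMA1, 4096 for LLaMA2" convention quoted in the converter help text earlier in this diff:

    def guess_n_ctx(rope_theta, norm_eps):
        if rope_theta == 1000000:
            return 16384   # CodeLlama
        if norm_eps == 1e-05:
            return 4096    # LLaMA v2
        return 2048        # LLaMA v1 / OpenLLaMA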
@ -285,7 +270,6 @@ class Params:
return Params( return Params(
n_vocab = n_vocab, n_vocab = n_vocab,
n_embd = n_embd, n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer, n_layer = n_layer,
n_ctx = n_ctx, n_ctx = n_ctx,
n_ff = n_ff, n_ff = n_ff,
@ -673,7 +657,7 @@ class LazyUnpickler(pickle.Unpickler):
assert isinstance(pid[1], LazyStorageKind) assert isinstance(pid[1], LazyStorageKind)
data_type = pid[1].data_type data_type = pid[1].data_type
filename_stem = pid[2] filename_stem = pid[2]
filename = self.data_base_path + '/' + filename_stem filename = f'{self.data_base_path}/{filename_stem}'
info = self.zip_file.getinfo(filename) info = self.zip_file.getinfo(filename)
def load(offset: int, elm_count: int) -> NDArray: def load(offset: int, elm_count: int) -> NDArray:
@ -689,7 +673,6 @@ class LazyUnpickler(pickle.Unpickler):
@staticmethod @staticmethod
def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
# pyright: ignore[reportSelfClsParameterName]
requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
assert isinstance(storage, LazyStorage) assert isinstance(storage, LazyStorage)
@ -842,9 +825,9 @@ class OutputFile:
name = "LLaMA" name = "LLaMA"
# TODO: better logic to determine model name # TODO: better logic to determine model name
if (params.n_ctx == 4096): if params.n_ctx == 4096:
name = "LLaMA v2" name = "LLaMA v2"
elif params.path_model: elif params.path_model is not None:
name = str(params.path_model.parent).split('/')[-1] name = str(params.path_model.parent).split('/')[-1]
self.gguf.add_name (name) self.gguf.add_name (name)
@ -857,13 +840,13 @@ class OutputFile:
self.gguf.add_head_count_kv (params.n_head_kv) self.gguf.add_head_count_kv (params.n_head_kv)
self.gguf.add_layer_norm_rms_eps (params.f_norm_eps) self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
if params.f_rope_freq_base: if params.f_rope_freq_base is not None:
self.gguf.add_rope_freq_base(params.f_rope_freq_base) self.gguf.add_rope_freq_base(params.f_rope_freq_base)
if params.f_rope_scale: if params.f_rope_scale is not None:
self.gguf.add_rope_scale_linear(params.f_rope_scale) self.gguf.add_rope_scale_linear(params.f_rope_scale)
if params.ftype: if params.ftype is not None:
self.gguf.add_file_type(params.ftype) self.gguf.add_file_type(params.ftype)
def add_meta_vocab(self, vocab: Vocab) -> None: def add_meta_vocab(self, vocab: Vocab) -> None:
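The change from truthiness tests to explicit None checks matters whenever a legitimate value happens to be falsy: with the old style, a parameter that is genuinely 0 or 0.0 would silently never be written, while an explicit is not None comparison only skips values that are actually missing:

    value = 0.0
    assert not value            # old truthiness check would skip this
    assert value is not None    # explicit check still writes it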
View file
@ -1,7 +1,3 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h" #include "build-info.h"
View file
@ -1,5 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
#include "common.h"
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
@ -499,10 +500,10 @@ struct llama_file {
errno = 0; errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp); std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) { if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno))); die_fmt("fread failed: %s", strerror(errno));
} }
if (ret != 1) { if (ret != 1) {
throw std::runtime_error(std::string("unexpectedly reached end of file")); die("unexpectedly reached end of file");
} }
} }
@ -597,8 +598,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
llama_file file(filename, "rb"); llama_file file(filename, "rb");
if (!file.fp) { if (!file.fp) {
fprintf(stderr, "error: %s: %s\n", strerror(errno), filename); die_fmt("%s: %s", strerror(errno), filename);
exit(1);
} }
const int n_vocab = config->vocab_size; const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused /* uint32_t max_token_length = */ file.read_u32(); // unused
View file
@ -1,8 +1,3 @@
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "embd-input.h" #include "embd-input.h"
#include <cassert> #include <cassert>
@ -23,7 +18,7 @@ extern "C" {
struct MyModel* create_mymodel(int argc, char ** argv) { struct MyModel* create_mymodel(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) { if (!gpt_params_parse(argc, argv, params)) {
return nullptr; return nullptr;
} }
View file
@ -11,17 +11,12 @@
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) { if (!gpt_params_parse(argc, argv, params)) {
return 1; return 1;
} }
params.embedding = true; params.embedding = true;
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
@ -47,6 +42,12 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const int n_ctx_train = llama_n_ctx_train(ctx);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
View file
@ -76,7 +76,7 @@ bool gguf_ex_write(const std::string & fname) {
gguf_write_to_file(ctx, fname.c_str(), false); gguf_write_to_file(ctx, fname.c_str(), false);
fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str()); printf("%s: wrote file '%s;\n", __func__, fname.c_str());
ggml_free(ctx_data); ggml_free(ctx_data);
gguf_free(ctx); gguf_free(ctx);
@ -93,20 +93,20 @@ bool gguf_ex_read_0(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx)); printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv // kv
{ {
const int n_kv = gguf_get_n_kv(ctx); const int n_kv = gguf_get_n_kv(ctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv); printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) { for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i); const char * key = gguf_get_key(ctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key); printf("%s: kv[%d]: key = %s\n", __func__, i, key);
} }
} }
@ -116,10 +116,10 @@ bool gguf_ex_read_0(const std::string & fname) {
const int keyidx = gguf_find_key(ctx, findkey); const int keyidx = gguf_find_key(ctx, findkey);
if (keyidx == -1) { if (keyidx == -1) {
fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey); printf("%s: find key: %s not found.\n", __func__, findkey);
} else { } else {
const char * key_value = gguf_get_val_str(ctx, keyidx); const char * key_value = gguf_get_val_str(ctx, keyidx);
fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value); printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
} }
} }
@ -127,13 +127,13 @@ bool gguf_ex_read_0(const std::string & fname) {
{ {
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i); const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
} }
} }
@ -153,20 +153,20 @@ bool gguf_ex_read_1(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx)); printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv // kv
{ {
const int n_kv = gguf_get_n_kv(ctx); const int n_kv = gguf_get_n_kv(ctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv); printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) { for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i); const char * key = gguf_get_key(ctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key); printf("%s: kv[%d]: key = %s\n", __func__, i, key);
} }
} }
@ -174,13 +174,13 @@ bool gguf_ex_read_1(const std::string & fname) {
{ {
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i); const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
} }
} }
@ -189,13 +189,13 @@ bool gguf_ex_read_1(const std::string & fname) {
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
fprintf(stdout, "%s: reading tensor %d data\n", __func__, i); printf("%s: reading tensor %d data\n", __func__, i);
const char * name = gguf_get_tensor_name(ctx, i); const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data); printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
// print first 10 elements // print first 10 elements
const float * data = (const float *) cur->data; const float * data = (const float *) cur->data;
@ -219,7 +219,7 @@ bool gguf_ex_read_1(const std::string & fname) {
} }
} }
fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data)); printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
ggml_free(ctx_data); ggml_free(ctx_data);
gguf_free(ctx); gguf_free(ctx);
@ -229,7 +229,7 @@ bool gguf_ex_read_1(const std::string & fname) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
if (argc < 3) { if (argc < 3) {
fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]); printf("usage: %s data.gguf r|w\n", argv[0]);
return -1; return -1;
} }
View file
@ -305,9 +305,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
if( cur == NULL ) { if( cur == NULL ) {
fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str()); printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
} else { } else {
// fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); // printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
} }
return cur; return cur;
@ -333,21 +333,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
return false; return false;
} }
fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
// print all kv // print all kv
#if 0 #if 0
{ {
const int n_kv = gguf_get_n_kv(ggufctx); const int n_kv = gguf_get_n_kv(ggufctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv); printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) { for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ggufctx, i); const char * key = gguf_get_key(ggufctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key); printf("%s: kv[%d]: key = %s\n", __func__, i, key);
} }
} }
#endif #endif
@ -357,21 +357,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
int keyidx; int keyidx;
keyidx = gguf_find_key(ggufctx, "general.name"); keyidx = gguf_find_key(ggufctx, "general.name");
if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.description"); keyidx = gguf_find_key(ggufctx, "general.description");
if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.author"); keyidx = gguf_find_key(ggufctx, "general.author");
if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.license"); keyidx = gguf_find_key(ggufctx, "general.license");
if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.architecture"); keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.file_type"); keyidx = gguf_find_key(ggufctx, "general.file_type");
if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
} }
// check required metadata // check required metadata
@ -382,11 +382,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
keyidx = gguf_find_key(ggufctx, "general.architecture"); keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) {
fprintf(stdout, "%s: model architecture not supported!\n", __func__); printf("%s: model architecture not supported!\n", __func__);
return false; return false;
} }
} else { } else {
fprintf(stdout, "%s: gguf model architecture not found!\n", __func__); printf("%s: gguf model architecture not found!\n", __func__);
return false; return false;
} }
@ -394,11 +394,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout"); keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout");
if (keyidx != -1) { if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) {
fprintf(stdout, "%s: model tensor data layout not supported!\n", __func__); printf("%s: model tensor data layout not supported!\n", __func__);
return false; return false;
} }
} else { } else {
fprintf(stdout, "%s: gguf model tensor data layout not found!\n", __func__); printf("%s: gguf model tensor data layout not found!\n", __func__);
return false; return false;
} }
@ -455,11 +455,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
if (keyidx != -1) { if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
fprintf(stdout, "%s: tokenizer model not supported!\n", __func__); printf("%s: tokenizer model not supported!\n", __func__);
return false; return false;
} }
} else { } else {
fprintf(stdout, "%s: tokenizer model not found!\n", __func__); printf("%s: tokenizer model not found!\n", __func__);
return false; return false;
} }
@ -467,22 +467,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
if (tokens_keyidx == -1) { if (tokens_keyidx == -1) {
fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__); printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
return false; return false;
} }
int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
if (merges_keyidx == -1) { if (merges_keyidx == -1) {
fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__); printf("%s: gpt2 tokenizer merges not found!\n", __func__);
return false; return false;
} }
hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);
fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
for (size_t i = 0; i < hparams.n_vocab; i++) { for (size_t i = 0; i < hparams.n_vocab; i++) {
std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@ -523,12 +523,12 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); } if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
} }
@ -543,13 +543,13 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
{ {
const int n_tensors = gguf_get_n_tensors(ggufctx); const int n_tensors = gguf_get_n_tensors(ggufctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ggufctx, i); const char * name = gguf_get_tensor_name (ggufctx, i);
const size_t offset = gguf_get_tensor_offset(ggufctx, i); const size_t offset = gguf_get_tensor_offset(ggufctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
} }
} }
#endif #endif
@ -953,7 +953,7 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) { if (!gpt_params_parse(argc, argv, params)) {
return 1; return 1;
} }
View file
@ -318,9 +318,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
if( cur == NULL ) { if( cur == NULL ) {
fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str()); printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
} else { } else {
// fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); // printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
} }
return cur; return cur;
@ -346,21 +346,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
return false; return false;
} }
fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
// print all kv // print all kv
#if 0 #if 0
{ {
const int n_kv = gguf_get_n_kv(ggufctx); const int n_kv = gguf_get_n_kv(ggufctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv); printf("%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) { for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ggufctx, i); const char * key = gguf_get_key(ggufctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key); printf("%s: kv[%d]: key = %s\n", __func__, i, key);
} }
} }
#endif #endif
@ -370,21 +370,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
int keyidx; int keyidx;
keyidx = gguf_find_key(ggufctx, "general.name"); keyidx = gguf_find_key(ggufctx, "general.name");
if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.description"); keyidx = gguf_find_key(ggufctx, "general.description");
if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.author"); keyidx = gguf_find_key(ggufctx, "general.author");
if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.license"); keyidx = gguf_find_key(ggufctx, "general.license");
if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.architecture"); keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.file_type"); keyidx = gguf_find_key(ggufctx, "general.file_type");
if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
} }
// check required metadata // check required metadata
@ -395,11 +395,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
keyidx = gguf_find_key(ggufctx, "general.architecture"); keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) {
fprintf(stdout, "%s: model architecture not supported!\n", __func__); printf("%s: model architecture not supported!\n", __func__);
return false; return false;
} }
} else { } else {
fprintf(stdout, "%s: gguf model architecture not found!\n", __func__); printf("%s: gguf model architecture not found!\n", __func__);
return false; return false;
} }
@ -456,11 +456,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
if (keyidx != -1) { if (keyidx != -1) {
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
fprintf(stdout, "%s: tokenizer model not supported!\n", __func__); printf("%s: tokenizer model not supported!\n", __func__);
return false; return false;
} }
} else { } else {
fprintf(stdout, "%s: tokenizer model not found!\n", __func__); printf("%s: tokenizer model not found!\n", __func__);
return false; return false;
} }
@ -468,22 +468,22 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
if (tokens_keyidx == -1) { if (tokens_keyidx == -1) {
fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__); printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
return false; return false;
} }
int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
if (merges_keyidx == -1) { if (merges_keyidx == -1) {
fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__); printf("%s: gpt2 tokenizer merges not found!\n", __func__);
return false; return false;
} }
hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);
fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
for (size_t i = 0; i < hparams.n_vocab; i++) { for (size_t i = 0; i < hparams.n_vocab; i++) {
std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@ -524,12 +524,12 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); } if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
} }
@ -543,13 +543,13 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
{ {
const int n_tensors = gguf_get_n_tensors(ggufctx); const int n_tensors = gguf_get_n_tensors(ggufctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); printf("%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ggufctx, i); const char * name = gguf_get_tensor_name (ggufctx, i);
const size_t offset = gguf_get_tensor_offset(ggufctx, i); const size_t offset = gguf_get_tensor_offset(ggufctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
} }
} }
#endif #endif
@ -925,7 +925,7 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) { if (!gpt_params_parse(argc, argv, params)) {
return 1; return 1;
} }
examples/llama-bench/llama-bench.cpp Executable file → Normal file
View file
@ -165,26 +165,26 @@ static const cmd_params cmd_params_defaults = {
}; };
static void print_usage(int /* argc */, char ** argv) { static void print_usage(int /* argc */, char ** argv) {
fprintf(stdout, "usage: %s [options]\n", argv[0]); printf("usage: %s [options]\n", argv[0]);
fprintf(stdout, "\n"); printf("\n");
fprintf(stdout, "options:\n"); printf("options:\n");
fprintf(stdout, " -h, --help\n"); printf(" -h, --help\n");
fprintf(stdout, " -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
fprintf(stdout, " -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
fprintf(stdout, " -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
fprintf(stdout, " -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
fprintf(stdout, " --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
fprintf(stdout, " -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
fprintf(stdout, " -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
fprintf(stdout, " -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str()); printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
fprintf(stdout, " -ts, --tensor_split <ts0/ts1/..> \n"); printf(" -ts, --tensor_split <ts0/ts1/..> \n");
fprintf(stdout, " -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql"); printf(" -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
fprintf(stdout, "\n"); printf("\n");
fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
} }
@ -986,7 +986,12 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx); test t(inst, lmodel, ctx);
// warmup run // warmup run
test_gen(ctx, 1, 0, t.n_threads); if (t.n_prompt > 0) {
test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
test_gen(ctx, 1, 0, t.n_threads);
}
for (int i = 0; i < params.reps; i++) { for (int i = 0; i < params.reps; i++) {
uint64_t t_start = get_time_ns(); uint64_t t_start = get_time_ns();
View file
@ -1,8 +1,3 @@
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h" #include "common.h"
#include "console.h" #include "console.h"
@ -48,8 +43,9 @@ static bool is_interacting = false;
void write_logfile( void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model, const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> input_tokens, const std::string output, const std::vector<llama_token> output_tokens) { const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens
) {
if (params.logdir.empty()) { if (params.logdir.empty()) {
return; return;
} }
@ -109,7 +105,7 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
g_params = &params; g_params = &params;
if (gpt_params_parse(argc, argv, params) == false) { if (!gpt_params_parse(argc, argv, params)) {
return 1; return 1;
} }
@ -151,14 +147,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
} }
if (params.n_ctx > 2048) {
// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
@ -194,6 +182,15 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const int n_ctx_train = llama_n_ctx_train(ctx);
if (params.n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
} else if (params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_TEE("\n");
@ -304,7 +301,7 @@ int main(int argc, char ** argv) {
// debug message about similarity of saved session, if applicable // debug message about similarity of saved session, if applicable
size_t n_matching_session_tokens = 0; size_t n_matching_session_tokens = 0;
if (session_tokens.size() > 0) { if (!session_tokens.empty()) {
for (llama_token id : session_tokens) { for (llama_token id : session_tokens) {
if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
break; break;
@ -402,7 +399,7 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: interactive mode on.\n", __func__); LOG_TEE("%s: interactive mode on.\n", __func__);
if (params.antiprompt.size()) { if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) { for (const auto & antiprompt : params.antiprompt) {
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
} }
@ -500,7 +497,7 @@ int main(int argc, char ** argv) {
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict // predict
if (embd.size() > 0) { if (!embd.empty()) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value. // --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4; int max_embd_size = n_ctx - 4;
@ -625,7 +622,7 @@ int main(int argc, char ** argv) {
LOG("n_past = %d\n", n_past); LOG("n_past = %d\n", n_past);
} }
if (embd.size() > 0 && !path_session.empty()) { if (!embd.empty() && !path_session.empty()) {
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
n_session_consumed = session_tokens.size(); n_session_consumed = session_tokens.size();
} }
@ -696,7 +693,7 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs; // if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) { if ((int) embd_inp.size() <= n_consumed) {
// check for reverse prompt // check for reverse prompt
if (params.antiprompt.size()) { if (!params.antiprompt.empty()) {
std::string last_output; std::string last_output;
for (auto id : last_tokens) { for (auto id : last_tokens) {
last_output += llama_token_to_piece(ctx, id); last_output += llama_token_to_piece(ctx, id);
@ -733,7 +730,7 @@ int main(int argc, char ** argv) {
LOG("found EOS token\n"); LOG("found EOS token\n");
if (params.interactive) { if (params.interactive) {
if (params.antiprompt.size() != 0) { if (!params.antiprompt.empty()) {
// tokenize and inject first reverse prompt // tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());


@ -368,7 +368,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
// Example, we have a context window of 512, we will compute perplexity for each of the // Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to // last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt. // process the entire prompt.
const int first = std::min(512, params.n_ctx/2); const int first = params.n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += params.n_ctx - first - 1; count += params.n_ctx - first - 1;
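For context on the arithmetic in this hunk: only the second half of each chunk contributes to the running negative log-likelihood, so a chunk of n_ctx tokens adds n_ctx - first - 1 terms (255 for n_ctx = 512). A minimal scalar sketch of that bookkeeping, with dummy per-token probabilities standing in for the model's output (everything below is illustrative, not the example's real code):

// Hypothetical sketch: accumulate NLL over the second half of one chunk.
// probs[i] stands in for the model's probability of token i+1 given tokens 0..i.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n_ctx = 512;
    const int first = n_ctx / 2;                 // was std::min(512, n_ctx/2) before this change

    std::vector<double> probs(n_ctx - 1, 0.9);   // dummy per-token probabilities

    double nll   = 0.0;
    int    count = 0;
    for (int i = first; i < n_ctx - 1; ++i) {    // n_ctx - first - 1 == 255 terms
        nll += -std::log(probs[i]);
        ++count;
    }
    printf("tokens counted: %d, chunk ppl: %f\n", count, std::exp(nll/count));
    return 0;
}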
@ -655,7 +655,7 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
params.n_batch = 512; params.n_batch = 512;
if (gpt_params_parse(argc, argv, params) == false) { if (!gpt_params_parse(argc, argv, params)) {
return 1; return 1;
} }
@ -668,11 +668,6 @@ int main(int argc, char ** argv) {
params.n_ctx += params.ppl_stride/2; params.n_ctx += params.ppl_stride/2;
} }
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
@ -698,6 +693,12 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const int n_ctx_train = llama_n_ctx_train(ctx);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");


@ -71,7 +71,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
} }
// Check if a layer is included/excluded by command line // Check if a layer is included/excluded by command line
bool layer_included(const quantize_stats_params params, const std::string & layer) { bool layer_included(const quantize_stats_params & params, const std::string & layer) {
for (const auto& excluded : params.exclude_layers) { for (const auto& excluded : params.exclude_layers) {
if (std::regex_search(layer, std::regex(excluded))) { if (std::regex_search(layer, std::regex(excluded))) {
return false; return false;


@ -143,10 +143,9 @@ int main(int argc, char ** argv) {
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
return 1; return 1;
} else { }
if (ftype_str == "COPY") { if (ftype_str == "COPY") {
params.only_copy = true; params.only_copy = true;
}
} }
arg_idx++; arg_idx++;
} }


@ -13,7 +13,7 @@ int main(int argc, char ** argv) {
params.repeat_last_n = 64; params.repeat_last_n = 64;
params.prompt = "The quick brown fox"; params.prompt = "The quick brown fox";
if (gpt_params_parse(argc, argv, params) == false) { if (!gpt_params_parse(argc, argv, params)) {
return 1; return 1;
} }
@ -44,7 +44,7 @@ int main(int argc, char ** argv) {
llama_free_model(model); llama_free_model(model);
return 1; return 1;
} }
auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true); auto tokens = llama_tokenize(ctx, params.prompt, true);
auto n_prompt_tokens = tokens.size(); auto n_prompt_tokens = tokens.size();
if (n_prompt_tokens < 1) { if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);


@ -118,7 +118,7 @@ static void server_log(const char *level, const char *function, int line,
} }
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
fprintf(stdout, "%.*s\n", (int)str.size(), str.data()); printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout); fflush(stdout);
} }
@ -139,7 +139,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
} }
// convert a vector of completion_token_output to json // convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> probs) static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> & probs)
{ {
json out = json::array(); json out = json::array();
for (const auto &prob : probs) for (const auto &prob : probs)
@ -271,7 +271,7 @@ struct llama_server_context
return true; return true;
} }
std::vector<llama_token> tokenize(json json_prompt, bool add_bos) std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
{ {
// If `add_bos` is true, we only add BOS when json_prompt is a string, // If `add_bos` is true, we only add BOS when json_prompt is a string,
// or the first element of the json_prompt array is a string. // or the first element of the json_prompt array is a string.
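A simplified sketch of the rule described in the comment above, assuming nlohmann::json (bundled with the server example) and the llama_tokenize(ctx, std::string, add_bos) helper used elsewhere in this diff; the real method also accepts raw token ids inside the array, which is omitted here:

// Hedged sketch: BOS is added only for a string prompt, or for the first
// element of an array prompt when that element is a string.
// Assumes "llama.h", "common.h" and "json.hpp" are included by the translation unit.
static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const json & json_prompt, bool add_bos) {
    std::vector<llama_token> prompt_tokens;

    if (json_prompt.is_string()) {
        const auto toks = ::llama_tokenize(ctx, json_prompt.get<std::string>(), add_bos);
        prompt_tokens.insert(prompt_tokens.end(), toks.begin(), toks.end());
    } else if (json_prompt.is_array()) {
        bool first = true;
        for (const auto & part : json_prompt) {
            if (part.is_string()) {
                const auto toks = ::llama_tokenize(ctx, part.get<std::string>(), add_bos && first);
                prompt_tokens.insert(prompt_tokens.end(), toks.begin(), toks.end());
            }
            first = false;
        }
    }
    return prompt_tokens;
}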
@ -611,7 +611,7 @@ struct llama_server_context
completion_token_output doCompletion() completion_token_output doCompletion()
{ {
const completion_token_output token_with_probs = nextToken(); auto token_with_probs = nextToken();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok); const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
generated_text += token_text; generated_text += token_text;
@ -694,50 +694,50 @@ struct llama_server_context
static void server_print_usage(const char *argv0, const gpt_params &params, static void server_print_usage(const char *argv0, const gpt_params &params,
const server_params &sparams) const server_params &sparams)
{ {
fprintf(stdout, "usage: %s [options]\n", argv0); printf("usage: %s [options]\n", argv0);
fprintf(stdout, "\n"); printf("\n");
fprintf(stdout, "options:\n"); printf("options:\n");
fprintf(stdout, " -h, --help show this help message and exit\n"); printf(" -h, --help show this help message and exit\n");
fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
if (llama_mlock_supported()) if (llama_mlock_supported())
{ {
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
} }
if (llama_mmap_supported()) if (llama_mmap_supported())
{ {
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
} }
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n"); printf(" --numa attempt optimizations that help on some NUMA systems\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stdout, " -ngl N, --n-gpu-layers N\n"); printf(" -ngl N, --n-gpu-layers N\n");
fprintf(stdout, " number of layers to store in VRAM\n"); printf(" number of layers to store in VRAM\n");
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); printf(" -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
fprintf(stdout, " -nommq, --no-mul-mat-q\n"); printf(" -nommq, --no-mul-mat-q\n");
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif #endif
fprintf(stdout, " -m FNAME, --model FNAME\n"); printf(" -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); printf(" model path (default: %s)\n", params.model.c_str());
fprintf(stdout, " -a ALIAS, --alias ALIAS\n"); printf(" -a ALIAS, --alias ALIAS\n");
fprintf(stdout, " set an alias for the model, will be added as `model` field in completion response\n"); printf(" set an alias for the model, will be added as `model` field in completion response\n");
fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stdout, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
fprintf(stdout, " --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
fprintf(stdout, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
fprintf(stdout, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
fprintf(stdout, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
fprintf(stdout, "\n"); printf("\n");
} }
static void server_params_parse(int argc, char **argv, server_params &sparams, static void server_params_parse(int argc, char **argv, server_params &sparams,
@ -1255,7 +1255,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
struct token_translator { struct token_translator {
llama_context * ctx; llama_context * ctx;
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); } std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
}; };
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) { void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
@ -1595,7 +1595,7 @@ int main(int argc, char **argv)
svr.set_base_dir(sparams.public_path); svr.set_base_dir(sparams.public_path);
// to make it ctrl+clickable: // to make it ctrl+clickable:
fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
LOG_INFO("HTTP server listening", { LOG_INFO("HTTP server listening", {
{"hostname", sparams.hostname}, {"hostname", sparams.hostname},


@ -1,7 +1,3 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "build-info.h" #include "build-info.h"
#include "common.h" #include "common.h"


@ -1,11 +1,8 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "build-info.h" #include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "grammar-parser.h"
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
@ -109,16 +106,35 @@ int main(int argc, char ** argv) {
// used to determine end of generation // used to determine end of generation
bool has_eos = false; bool has_eos = false;
// grammar stuff
struct llama_grammar * grammar_dft = NULL;
struct llama_grammar * grammar_tgt = NULL;
grammar_parser::parse_state parsed_grammar;
// if requested - load the grammar, error checking is omitted for brevity
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
return 1;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}
const auto t_dec_start = ggml_time_us(); const auto t_dec_start = ggml_time_us();
while (true) { while (true) {
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted)); LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
// sample from the drafted tokens if any
int i_dft = 0; int i_dft = 0;
while (true) { while (true) {
const llama_token id = llama_sample_token(ctx_tgt, NULL, NULL, params, last_tokens, candidates, i_dft); // sample from the target model
const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
// remember which tokens were sampled - used for repetition penalties during sampling
last_tokens.erase(last_tokens.begin()); last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id); last_tokens.push_back(id);
@ -134,8 +150,9 @@ int main(int argc, char ** argv) {
++n_predict; ++n_predict;
// check if the draft matches the target
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) { if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
LOG("drafted token %d accepted\n", id); LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept; ++n_accept;
++n_past_tgt; ++n_past_tgt;
++n_past_dft; ++n_past_dft;
@ -145,6 +162,14 @@ int main(int argc, char ** argv) {
} }
// the drafted token was rejected or we are out of drafted tokens // the drafted token was rejected or we are out of drafted tokens
if (i_dft < (int) drafted.size()) {
LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
} else {
LOG("out of drafted tokens\n");
}
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads); llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
++n_past_dft; ++n_past_dft;
@ -158,7 +183,16 @@ int main(int argc, char ** argv) {
break; break;
} }
// sample n_draft tokens from the draft model picking the best token if (grammar_tgt) {
if (grammar_dft) {
llama_grammar_free(grammar_dft);
}
grammar_dft = llama_grammar_copy(grammar_tgt);
LOG("copied target grammar to draft grammar\n");
}
// sample n_draft tokens from the draft model using greedy decoding
int n_past_cur = n_past_dft; int n_past_cur = n_past_dft;
for (int i = 0; i < n_draft; ++i) { for (int i = 0; i < n_draft; ++i) {
float * logits = llama_get_logits(ctx_dft); float * logits = llama_get_logits(ctx_dft);
@ -170,25 +204,40 @@ int main(int argc, char ** argv) {
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (grammar_dft != NULL) {
llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
}
// computes softmax and sorts the candidates // computes softmax and sorts the candidates
llama_sample_softmax(ctx_dft, &cur_p); llama_sample_softmax(ctx_dft, &cur_p);
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
LOG(" - draft candidate %d: %d (%.3f)\n", i, cur_p.data[i].id, cur_p.data[i].p); LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
} }
// too low probability, stop drafting // TODO: better logic?
if (cur_p.data[0].p < 2*cur_p.data[1].p) { if (cur_p.data[0].p < 2*cur_p.data[1].p) {
LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
break; break;
} }
drafted.push_back(cur_p.data[0].id); // drafted token
const llama_token id = cur_p.data[0].id;
drafted.push_back(id);
++n_drafted; ++n_drafted;
if (i < n_draft - 1) { // no need to evaluate the last drafted token, since we won't use the result
// evaluate the drafted token on the draft model if (i == n_draft - 1) {
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads); break;
++n_past_cur; }
// evaluate the drafted token on the draft model
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
++n_past_cur;
if (grammar_dft != NULL) {
llama_grammar_accept_token(ctx_dft, grammar_dft, id);
} }
} }
@ -196,6 +245,7 @@ int main(int argc, char ** argv) {
llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads); llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
++n_past_tgt; ++n_past_tgt;
// the first token is always proposed by the target model before the speculation loop
drafted.erase(drafted.begin()); drafted.erase(drafted.begin());
} }
@ -226,6 +276,10 @@ int main(int argc, char ** argv) {
llama_free(ctx_dft); llama_free(ctx_dft);
llama_free_model(model_dft); llama_free_model(model_dft);
if (grammar_dft != NULL) {
llama_grammar_free(grammar_dft);
llama_grammar_free(grammar_tgt);
}
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); fprintf(stderr, "\n\n");
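To make the accept/reject flow in the loop above easier to follow, here is a self-contained toy version with plain integers standing in for tokens and a hard-coded stand-in for the target sampler; none of the llama.cpp API is used and all values are invented:

#include <cstdio>
#include <vector>

int main() {
    // tokens proposed greedily by the (cheap) draft model
    std::vector<int> drafted = {12, 7, 99, 3};

    // what the (expensive) target model would actually sample at each position
    std::vector<int> target  = {12, 7, 42, 3, 8};

    int    n_accept = 0;
    size_t i_dft    = 0;

    while (true) {
        const int id = target[i_dft];                        // "sample from the target model"

        if (i_dft < drafted.size() && id == drafted[i_dft]) {
            // the draft matches the target sample - accept it for free
            printf("accepted drafted token %d\n", id);
            ++n_accept;
            ++i_dft;
            continue;
        }

        // mismatch (or out of drafted tokens): keep the target token and stop this round
        printf("kept target token %d, rejected the rest of the draft\n", id);
        break;
    }

    printf("accepted %d of %zu drafted tokens\n", n_accept, drafted.size());
    return 0;
}

Draft tokens are free to keep only as long as they agree with what the target model would have sampled anyway; the first disagreement ends the speculation round.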


@ -169,10 +169,6 @@ struct my_llama_hparams {
float rope_freq_base = 10000.0f; float rope_freq_base = 10000.0f;
float rope_freq_scale = 1.0f; float rope_freq_scale = 1.0f;
bool operator!=(const my_llama_hparams& other) const {
return memcmp(this, &other, sizeof(my_llama_hparams));
}
}; };
struct my_llama_layer { struct my_llama_layer {
@ -929,28 +925,6 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa
} }
} }
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token>& out) { int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token>& out) {
FILE * fp = std::fopen(filename, "rb"); FILE * fp = std::fopen(filename, "rb");
if (fp == NULL) { if (fp == NULL) {
@ -983,10 +957,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
out.resize(size+1); out.resize(size+1);
if (std::fread(buf.data(), size, 1, fp) != 1) { if (std::fread(buf.data(), size, 1, fp) != 1) {
throw std::runtime_error(std::string("unexpectedly reached end of file")); die("unexpectedly reached end of file");
} }
if (ferror(fp)) { if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno))); die_fmt("fread failed: %s", strerror(errno));
} }
buf[size] = '\0'; buf[size] = '\0';
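The die() and die_fmt() helpers used here are not part of this diff; a minimal sketch of what such helpers might look like (names taken from the calls above, implementation assumed):

#include <cstdarg>
#include <cstdio>
#include <cstdlib>

// print a fixed message and abort the program
[[noreturn]] static void die(const char * msg) {
    fprintf(stderr, "error: %s\n", msg);
    exit(1);
}

// printf-style variant
[[noreturn]] static void die_fmt(const char * fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    fprintf(stderr, "error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    va_end(ap);
    exit(1);
}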
@ -1047,11 +1021,11 @@ void shuffle_ints(int * begin, int * end) {
if (kid >= 0) { \ if (kid >= 0) { \
enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
if (ktype != (type)) { \ if (ktype != (type)) { \
throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \ die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
} \ } \
(dst) = func(ctx, kid); \ (dst) = func(ctx, kid); \
} else if (req) { \ } else if (req) { \
throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ die_fmt("key not found in model: %s", skey.c_str()); \
} \ } \
} }
@ -1136,7 +1110,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
} else { } else {
throw std::runtime_error("unknown optimizer type\n"); die("unknown optimizer type");
} }
} }
@ -1315,20 +1289,20 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod
const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST)); const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST));
if (token_idx == -1) { if (token_idx == -1) {
throw std::runtime_error("cannot find tokenizer vocab in model file\n"); die("cannot find tokenizer vocab in model file");
} }
const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx); const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx);
const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES)); const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES));
if (score_idx == -1) { if (score_idx == -1) {
throw std::runtime_error("cannot find tokenizer scores in model file\n"); die("cannot find tokenizer scores in model file");
} }
const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx); const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx);
const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE)); const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE));
if (toktype_idx == -1) { if (toktype_idx == -1) {
throw std::runtime_error("cannot find token type list in GGUF file\n"); die("cannot find token type list in GGUF file");
} }
const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx); const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx);
@ -1356,7 +1330,7 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod
// read and copy bpe merges // read and copy bpe merges
const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES)); const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES));
if (merges_keyidx == -1) { if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n"); die("cannot find tokenizer merges in model file");
} }
const int n_merges = gguf_get_arr_n(vctx, merges_keyidx); const int n_merges = gguf_get_arr_n(vctx, merges_keyidx);
@ -1988,7 +1962,7 @@ void opt_callback(void * vdata, float * sched) {
float min_sched = params->adam_min_alpha / params->adam_alpha; float min_sched = params->adam_min_alpha / params->adam_alpha;
*sched = min_sched + *sched * (1.0f - min_sched); *sched = min_sched + *sched * (1.0f - min_sched);
int impr_plot = std::isnan(opt->loss_after) ? 0 : -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); int impr_plot = std::isnan(opt->loss_after) ? 0 : -std::lround(1 + (opt->loss_before - opt->loss_after) * 10.0f);
printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0);
if (data->shuffle_countdown < n_batch) { if (data->shuffle_countdown < n_batch) {

View file

@ -93,6 +93,10 @@
type = "app"; type = "app";
program = "${self.packages.${system}.default}/bin/quantize"; program = "${self.packages.${system}.default}/bin/quantize";
}; };
apps.train-text-from-scratch = {
type = "app";
program = "${self.packages.${system}.default}/bin/train-text-from-scratch";
};
apps.default = self.apps.${system}.llama; apps.default = self.apps.${system}.llama;
devShells.default = pkgs.mkShell { devShells.default = pkgs.mkShell {
buildInputs = [ llama-python ]; buildInputs = [ llama-python ];


@ -1,8 +1,3 @@
// defines MAP_ANONYMOUS
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml.h" #include "ggml.h"
#include <assert.h> #include <assert.h>
@ -138,7 +133,7 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
#endif #endif
size_t size = ggml_allocr_get_alloc_size(alloc, tensor); size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
@ -165,14 +160,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
if (best_fit_block == -1) { if (best_fit_block == -1) {
// the last block is our last resort // the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
max_avail = MAX(max_avail, block->size);
if (block->size >= size) { if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1; best_fit_block = alloc->n_free_blocks - 1;
max_avail = MAX(max_avail, block->size);
} else { } else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail); __func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer"); GGML_ASSERT(!"not enough space in the buffer");
return; return;
} }
} }
struct free_block * block = &alloc->free_blocks[best_fit_block]; struct free_block * block = &alloc->free_blocks[best_fit_block];
@ -316,7 +311,11 @@ static void * alloc_vmem(size_t size) {
#if defined(_WIN32) #if defined(_WIN32)
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS); return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
#elif defined(_POSIX_MAPPED_FILES) #elif defined(_POSIX_MAPPED_FILES)
return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (ptr == MAP_FAILED) {
return NULL;
}
return ptr;
#else #else
// use a fixed address for other platforms // use a fixed address for other platforms
uintptr_t base_addr = (uintptr_t)-size - 0x100; uintptr_t base_addr = (uintptr_t)-size - 0x100;
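The point of this change is that mmap() signals failure with MAP_FAILED ((void *) -1) rather than NULL, so a caller that only checks for NULL would treat a failed reservation as a usable address. A small POSIX-only sketch of the corrected pattern (illustrative, not the project's code):

#include <stdio.h>
#include <sys/mman.h>

// reserve (not commit) a span of address space; return NULL on failure
static void * reserve_vmem(size_t size) {
    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
    if (ptr == MAP_FAILED) {
        return NULL;
    }
    return ptr;
}

int main(void) {
    void * p = reserve_vmem(1u << 20);
    printf("reserved: %p\n", p);
    if (p) {
        munmap(p, 1u << 20);
    }
    return 0;
}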

File diff suppressed because it is too large.


@ -63,7 +63,9 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(relu); GGML_METAL_DECL_KERNEL(relu);
GGML_METAL_DECL_KERNEL(gelu); GGML_METAL_DECL_KERNEL(gelu);
GGML_METAL_DECL_KERNEL(soft_max); GGML_METAL_DECL_KERNEL(soft_max);
GGML_METAL_DECL_KERNEL(soft_max_4);
GGML_METAL_DECL_KERNEL(diag_mask_inf); GGML_METAL_DECL_KERNEL(diag_mask_inf);
GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
GGML_METAL_DECL_KERNEL(get_rows_f16); GGML_METAL_DECL_KERNEL(get_rows_f16);
GGML_METAL_DECL_KERNEL(get_rows_q4_0); GGML_METAL_DECL_KERNEL(get_rows_q4_0);
GGML_METAL_DECL_KERNEL(get_rows_q4_1); GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@ -77,6 +79,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@ -117,14 +120,17 @@ static NSString * const msl_library_source = @"see metal.metal";
struct ggml_metal_context * ggml_metal_init(int n_cb) { struct ggml_metal_context * ggml_metal_init(int n_cb) {
metal_printf("%s: allocating\n", __func__); metal_printf("%s: allocating\n", __func__);
// Show all the Metal device instances in the system
NSArray * devices = MTLCopyAllDevices();
id <MTLDevice> device; id <MTLDevice> device;
NSString * s; NSString * s;
#if TARGET_OS_OSX
// Show all the Metal device instances in the system
NSArray * devices = MTLCopyAllDevices();
for (device in devices) { for (device in devices) {
s = [device name]; s = [device name];
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
} }
#endif
// Pick and show default Metal device // Pick and show default Metal device
device = MTLCreateSystemDefaultDevice(); device = MTLCreateSystemDefaultDevice();
@ -141,12 +147,20 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT); ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
#if 0 #ifdef GGML_SWIFT
// compile from source string and show compile log // load the default.metallib file
{ {
NSError * error = nil; NSError * error = nil;
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
NSURL * libURL = [NSURL fileURLWithPath:libPath];
// Load the metallib file into a Metal library
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) { if (error) {
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
@ -207,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(relu); GGML_METAL_ADD_KERNEL(relu);
GGML_METAL_ADD_KERNEL(gelu); GGML_METAL_ADD_KERNEL(gelu);
GGML_METAL_ADD_KERNEL(soft_max); GGML_METAL_ADD_KERNEL(soft_max);
GGML_METAL_ADD_KERNEL(soft_max_4);
GGML_METAL_ADD_KERNEL(diag_mask_inf); GGML_METAL_ADD_KERNEL(diag_mask_inf);
GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
GGML_METAL_ADD_KERNEL(get_rows_f16); GGML_METAL_ADD_KERNEL(get_rows_f16);
GGML_METAL_ADD_KERNEL(get_rows_q4_0); GGML_METAL_ADD_KERNEL(get_rows_q4_0);
GGML_METAL_ADD_KERNEL(get_rows_q4_1); GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@ -221,6 +237,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@ -247,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#undef GGML_METAL_ADD_KERNEL #undef GGML_METAL_ADD_KERNEL
} }
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
#if TARGET_OS_OSX
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.maxTransferRate != 0) { if (ctx->device.maxTransferRate != 0) {
metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
} else { } else {
metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
} }
#endif
return ctx; return ctx;
} }
@ -273,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(relu); GGML_METAL_DEL_KERNEL(relu);
GGML_METAL_DEL_KERNEL(gelu); GGML_METAL_DEL_KERNEL(gelu);
GGML_METAL_DEL_KERNEL(soft_max); GGML_METAL_DEL_KERNEL(soft_max);
GGML_METAL_DEL_KERNEL(diag_mask_inf); GGML_METAL_DEL_KERNEL(soft_max_4);
GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
GGML_METAL_DEL_KERNEL(get_rows_f16); GGML_METAL_DEL_KERNEL(get_rows_f16);
GGML_METAL_DEL_KERNEL(get_rows_q4_0); GGML_METAL_DEL_KERNEL(get_rows_q4_0);
GGML_METAL_DEL_KERNEL(get_rows_q4_1); GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@ -287,6 +307,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(norm); GGML_METAL_DEL_KERNEL(norm);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row); GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@ -327,7 +348,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
void * ggml_metal_host_malloc(size_t n) { void * ggml_metal_host_malloc(size_t n) {
void * data = NULL; void * data = NULL;
const int result = posix_memalign((void **) &data, getpagesize(), n); const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
if (result != 0) { if (result != 0) {
metal_printf("%s: error: posix_memalign failed\n", __func__); metal_printf("%s: error: posix_memalign failed\n", __func__);
return NULL; return NULL;
@ -401,7 +422,7 @@ bool ggml_metal_add_buffer(
} }
} }
const size_t size_page = getpagesize(); const size_t size_page = sysconf(_SC_PAGESIZE);
size_t size_aligned = size; size_t size_aligned = size;
if ((size_aligned % size_page) != 0) { if ((size_aligned % size_page) != 0) {
@ -454,6 +475,7 @@ bool ggml_metal_add_buffer(
} }
} }
#if TARGET_OS_OSX
metal_printf(", (%8.2f / %8.2f)", metal_printf(", (%8.2f / %8.2f)",
ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@ -463,6 +485,9 @@ bool ggml_metal_add_buffer(
} else { } else {
metal_printf("\n"); metal_printf("\n");
} }
#else
metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
#endif
} }
return true; return true;
@ -750,7 +775,7 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&scale length:sizeof(scale) atIndex:2]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
@ -762,7 +787,7 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
@ -782,7 +807,7 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
@ -796,13 +821,16 @@ void ggml_metal_graph_compute(
{ {
const int nth = 32; const int nth = 32;
[encoder setComputePipelineState:ctx->pipeline_soft_max]; if (ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_soft_max_4];
} else {
[encoder setComputePipelineState:ctx->pipeline_soft_max];
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
@ -810,14 +838,23 @@ void ggml_metal_graph_compute(
{ {
const int n_past = ((int32_t *)(dst->op_params))[0]; const int n_past = ((int32_t *)(dst->op_params))[0];
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; if (ne00%8 == 0) {
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
} else {
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&n_past length:sizeof(int) atIndex:4]; [encoder setBytes:&n_past length:sizeof(int) atIndex:4];
[encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; if (ne00%8 == 0) {
[encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
}
else {
[encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
}
} break; } break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
{ {
@ -864,6 +901,7 @@ void ggml_metal_graph_compute(
} else { } else {
int nth0 = 32; int nth0 = 32;
int nth1 = 1; int nth1 = 1;
int nrows = 1;
// use custom matrix x vector kernel // use custom matrix x vector kernel
switch (src0t) { switch (src0t) {
@ -873,8 +911,12 @@ void ggml_metal_graph_compute(
nth1 = 1; nth1 = 1;
if (ne11 * ne12 < 4) { if (ne11 * ne12 < 4) {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
nrows = ne11;
} else { } else {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
nrows = 4;
} }
} break; } break;
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
@ -995,7 +1037,7 @@ void ggml_metal_graph_compute(
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
int64_t ny = (ne11 + 3)/4; int64_t ny = (ne11 + nrows - 1)/nrows;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
} }
@ -1141,7 +1183,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; [encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
} break; } break;
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_CPY: case GGML_OP_CPY:


@ -63,18 +63,18 @@ kernel void kernel_mul_row(
} }
kernel void kernel_scale( kernel void kernel_scale(
device const float * src0, device const float4 * src0,
device float * dst, device float4 * dst,
constant float & scale, constant float & scale,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * scale; dst[tpig] = src0[tpig] * scale;
} }
kernel void kernel_silu( kernel void kernel_silu(
device const float * src0, device const float4 * src0,
device float * dst, device float4 * dst,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
float x = src0[tpig]; device const float4 & x = src0[tpig];
dst[tpig] = x / (1.0f + exp(-x)); dst[tpig] = x / (1.0f + exp(-x));
} }
@ -89,10 +89,10 @@ constant float GELU_COEF_A = 0.044715f;
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
kernel void kernel_gelu( kernel void kernel_gelu(
device const float * src0, device const float4 * src0,
device float * dst, device float4 * dst,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
float x = src0[tpig]; device const float4 & x = src0[tpig];
// BEWARE !!! // BEWARE !!!
// Simply using "tanh" instead of "precise::tanh" will sometimes result in NaNs! // Simply using "tanh" instead of "precise::tanh" will sometimes result in NaNs!
@ -107,7 +107,6 @@ kernel void kernel_soft_max(
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
constant int64_t & ne02, constant int64_t & ne02,
threadgroup float * buf [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) { uint3 ntg[[threads_per_threadgroup]]) {
@ -119,64 +118,70 @@ kernel void kernel_soft_max(
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
// parallel max // parallel max
buf[tpitg[0]] = -INFINITY; float lmax = psrc0[tpitg[0]];
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
buf[tpitg[0]] = MAX(buf[tpitg[0]], psrc0[i00]); lmax = MAX(lmax, psrc0[i00]);
} }
const float max = simd_max(lmax);
// reduce
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg[0]/2; i > 0; i /= 2) {
if (tpitg[0] < i) {
buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
//// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
// the loop, and when that is done, buf[0] has the correct (synchronized) value
//if (tpitg[0] == 0) {
// buf[0] = buf[0];
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float max = buf[0];
// parallel sum // parallel sum
buf[tpitg[0]] = 0.0f; float lsum = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
const float exp_psrc0 = exp(psrc0[i00] - max); const float exp_psrc0 = exp(psrc0[i00] - max);
buf[tpitg[0]] += exp_psrc0; lsum += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not // Remember the result of exp here. exp is expensive, so we really do not
// wish to compute it twice. // wish to compute it twice.
pdst[i00] = exp_psrc0; pdst[i00] = exp_psrc0;
} }
// reduce const float sum = simd_sum(lsum);
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg[0]/2; i > 0; i /= 2) {
if (tpitg[0] < i) {
buf[tpitg[0]] += buf[tpitg[0] + i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
// broadcast - not needed, see above
//// broadcast
//if (tpitg[0] == 0) {
// buf[0] = buf[0];
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float sum = buf[0];
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
pdst[i00] /= sum; pdst[i00] /= sum;
} }
} }
kernel void kernel_soft_max_4(
device const float * src0,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
// parallel max
float4 lmax4 = psrc4[tpitg[0]];
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
lmax4 = fmax(lmax4, psrc4[i00]);
}
float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
const float max = simd_max(lmax);
// parallel sum
float4 lsum4 = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
const float4 exp_psrc4 = exp(psrc4[i00] - max);
lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4;
}
float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
const float sum = simd_sum(lsum);
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
pdst4[i00] /= sum;
}
}
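Both soft_max kernels follow the same three passes: find the row maximum, exponentiate and sum with the maximum subtracted, then normalize, with the reduction now done through simd_max/simd_sum instead of threadgroup memory. A scalar C++ reference of those passes, for comparison only:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// numerically stable softmax over one row, same structure as the kernels
static void soft_max_row(std::vector<float> & row) {
    // pass 1: row maximum (subtracting it keeps exp() from overflowing)
    const float max = *std::max_element(row.begin(), row.end());

    // pass 2: exponentiate and accumulate the sum, caching the exp() results
    float sum = 0.0f;
    for (float & x : row) {
        x = std::exp(x - max);
        sum += x;
    }

    // pass 3: normalize
    for (float & x : row) {
        x /= sum;
    }
}

int main() {
    std::vector<float> row = {1.0f, 2.0f, 3.0f, 4.0f};
    soft_max_row(row);
    for (float x : row) {
        printf("%.4f ", x);   // ~0.0321 0.0871 0.2369 0.6439
    }
    printf("\n");
    return 0;
}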
kernel void kernel_diag_mask_inf( kernel void kernel_diag_mask_inf(
device const float * src0, device const float * src0,
device float * dst, device float * dst,
@ -192,6 +197,33 @@ kernel void kernel_diag_mask_inf(
dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY; dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
} else { } else {
dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00]; dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
}
}
kernel void kernel_diag_mask_inf_8(
device const float4 * src0,
device float4 * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int & n_past,
uint3 tpig[[thread_position_in_grid]]) {
const int64_t i = 2*tpig[0];
dst[i+0] = src0[i+0];
dst[i+1] = src0[i+1];
int64_t i4 = 4*i;
const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
const int64_t i01 = i4/(ne00); i4 -= i01*ne00;
const int64_t i00 = i4;
for (int k = 3; k >= 0; --k) {
if (i00 + 4 + k <= n_past + i01) {
break;
}
dst[i+1][k] = -INFINITY;
if (i00 + k > n_past + i01) {
dst[i][k] = -INFINITY;
}
} }
} }
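For reference, the masking rule the vectorized kernel above implements: element i00 of row i01 is kept when i00 <= n_past + i01 and set to -INFINITY otherwise. A scalar C++ sketch of the same rule:

#include <cstdio>
#include <limits>
#include <vector>

// apply a causal mask in place: column i00 of row i01 is visible only if i00 <= n_past + i01
static void diag_mask_inf(std::vector<float> & m, int ne00, int ne01, int n_past) {
    const float inf = std::numeric_limits<float>::infinity();
    for (int i01 = 0; i01 < ne01; ++i01) {
        for (int i00 = 0; i00 < ne00; ++i00) {
            if (i00 > n_past + i01) {
                m[i01*ne00 + i00] = -inf;
            }
        }
    }
}

int main() {
    const int ne00 = 4, ne01 = 3, n_past = 0;
    std::vector<float> m(ne00*ne01, 1.0f);
    diag_mask_inf(m, ne00, ne01, n_past);
    for (int i01 = 0; i01 < ne01; ++i01) {
        for (int i00 = 0; i00 < ne00; ++i00) {
            printf("%6.1f ", m[i01*ne00 + i00]);
        }
        printf("\n");
    }
    return 0;
}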
@ -220,14 +252,10 @@ kernel void kernel_norm(
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
//// broadcast const float mean = sum[0] / ne00;
//if (tpitg == 0) {
// sum[0] /= ne00;
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float mean = sum[0];
// recenter and VARIANCE // recenter and VARIANCE
threadgroup_barrier(mem_flags::mem_threadgroup);
device float * y = dst + tgpig*ne00; device float * y = dst + tgpig*ne00;
sum[tpitg] = 0.0f; sum[tpitg] = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) { for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@ -235,12 +263,6 @@ kernel void kernel_norm(
sum[tpitg] += y[i00] * y[i00]; sum[tpitg] += y[i00] * y[i00];
} }
//// VARIANCE
//// parallel sum
//sum[tpitg] = 0.0f;
//for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
// sum[tpitg] += y[i00] * y[i00];
//}
// reduce // reduce
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg/2; i > 0; i /= 2) { for (uint i = ntg/2; i > 0; i /= 2) {
@ -249,12 +271,7 @@ kernel void kernel_norm(
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
//// broadcast const float variance = sum[0] / ne00;
//if (tpitg == 0) {
// sum[0] /= ne00;
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float variance = sum[0];
const float scale = 1.0f/sqrt(variance + eps); const float scale = 1.0f/sqrt(variance + eps);
for (int i00 = tpitg; i00 < ne00; i00 += ntg) { for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@ -262,7 +279,6 @@ kernel void kernel_norm(
} }
} }
kernel void kernel_rms_norm( kernel void kernel_rms_norm(
device const void * src0, device const void * src0,
device float * dst, device float * dst,
@ -630,7 +646,49 @@ kernel void kernel_mul_mat_f16_f32(
} }
} }
} }
}
// Assumes row size (ne00) is a multiple of 4
kernel void kernel_mul_mat_f16_f32_l4(
device const char * src0,
device const char * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int nrows = ne11;
const int64_t r0 = tgpig.x;
const int64_t im = tgpig.z;
device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
for (int r1 = 0; r1 < nrows; ++r1) {
device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
} }
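kernel_mul_mat_f16_f32_l4 assumes the row size ne00 is a multiple of 4 and has each simdgroup walk one row of src0 in half4/float4 chunks. A plain scalar C++ reference of the same matrix-vector product, with the f16 weights modeled as floats for simplicity:

#include <cstdio>
#include <vector>

// dst[r] = dot(src0 row r, src1), the scalar version of the per-row kernel
static void mul_mat_vec(const std::vector<float> & src0, // ne01 x ne00, row-major
                        const std::vector<float> & src1, // ne00
                        std::vector<float> & dst,        // ne01
                        int ne00, int ne01) {
    for (int r0 = 0; r0 < ne01; ++r0) {
        float sum = 0.0f;
        for (int i = 0; i < ne00; ++i) {
            sum += src0[r0*ne00 + i] * src1[i];
        }
        dst[r0] = sum;
    }
}

int main() {
    const int ne00 = 4, ne01 = 2;
    std::vector<float> src0 = {1, 2, 3, 4,
                               5, 6, 7, 8};
    std::vector<float> src1 = {1, 1, 1, 1};
    std::vector<float> dst(ne01);
    mul_mat_vec(src0, src1, dst, ne00, ne01);
    printf("%.1f %.1f\n", dst[0], dst[1]);   // 10.0 26.0
    return 0;
}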
kernel void kernel_alibi_f32( kernel void kernel_alibi_f32(
@ -699,25 +757,27 @@ kernel void kernel_rope(
constant int & mode, constant int & mode,
constant float & freq_base, constant float & freq_base,
constant float & freq_scale, constant float & freq_scale,
uint3 tpig[[thread_position_in_grid]]) { uint tiitg[[thread_index_in_threadgroup]],
const int64_t i3 = tpig[2]; uint3 tptg[[threads_per_threadgroup]],
const int64_t i2 = tpig[1]; uint3 tgpig[[threadgroup_position_in_grid]]) {
const int64_t i1 = tpig[0]; const int64_t i3 = tgpig[2];
const int64_t i2 = tgpig[1];
const int64_t i1 = tgpig[0];
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const float theta_scale = pow(freq_base, -2.0f/n_dims);
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
float theta = freq_scale * (float)p; const float theta_0 = freq_scale * (float)p;
const float inv_ndims = -1.f/n_dims;
if (!is_neox) { if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) { for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
const float cos_theta = cos(theta); const float cos_theta = cos(theta);
const float sin_theta = sin(theta); const float sin_theta = sin(theta);
theta *= theta_scale;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -729,12 +789,12 @@ kernel void kernel_rope(
} }
} else { } else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) { for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
const float cos_theta = cos(theta); const float cos_theta = cos(theta);
const float sin_theta = sin(theta); const float sin_theta = sin(theta);
theta *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2; const int64_t i0 = ib*n_dims + ic/2;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
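The closed-form angle used here is equivalent to the serial update it replaces: starting from theta_0 = freq_scale * p and multiplying by theta_scale = pow(freq_base, -2.0f/n_dims) once per processed pair gives, after i0/2 steps, theta_0 * pow(freq_base, -i0/n_dims), which is exactly theta_0 * pow(freq_base, inv_ndims*i0); in the neox branch the exponent inv_ndims*ic - ib likewise equals -(ib + ic/n_dims), the accumulated product after ib*n_dims/2 + ic/2 steps. Computing the angle directly lets the loop be strided across the threadgroup (tiitg/tptg.x) without carrying state between iterations.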
@ -1138,31 +1198,40 @@ kernel void kernel_mul_mat_q3_K_f32(
device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0; device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0;
device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1;
float yl[16]; float yl[32];
const uint16_t kmask1 = 0x0303; const uint16_t kmask1 = 0x3030;
const uint16_t kmask2 = 0x0f0f; const uint16_t kmask2 = 0x0f0f;
const int tid = tiisg/2; const int tid = tiisg/4;
const int ix = tiisg%2; const int ix = tiisg%4;
const int ip = tid/8; // 0 or 1 const int ip = tid/4; // 0 or 1
const int il = tid/2 - 4*ip; // 0...3 const int il = 2*((tid%4)/2); // 0 or 2
const int ir = tid%2; const int ir = tid%2;
const int n = 8; const int n = 8;
const int l0 = n*ir; const int l0 = n*ir;
const uint16_t m1 = 1 << (4*ip + il); // One would think that the Metal compiler would figure out that ip and il can only have
const uint16_t m2 = m1 << 8; // 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it
// with these two tables.
//
// Possible masks for the high bit
const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200}, // ip = 0, il = 0
{0x0004, 0x0400, 0x0008, 0x0800}, // ip = 0, il = 2
{0x0010, 0x1000, 0x0020, 0x2000}, // ip = 1, il = 0
{0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2
// Possible masks for the low 2 bits
const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}};
const ushort4 hm = mm[2*ip + il/2];
const int shift = 2*il; const int shift = 2*il;
const uint16_t qm1 = 0x0003 << shift; const float v1 = il == 0 ? 4.f : 64.f;
const uint16_t qm2 = 0x0300 << shift; const float v2 = 4.f * v1;
const int32_t v1 = 4 << shift;
const int32_t v2 = 1024 << shift;
const uint16_t s_shift1 = 4*ip; const uint16_t s_shift1 = 4*ip;
const uint16_t s_shift2 = s_shift1 + 2*(il/2); const uint16_t s_shift2 = s_shift1 + il;
const int ik = 4 + (il%2);
const int q_offset = 32*ip + l0; const int q_offset = 32*ip + l0;
const int y_offset = 128*ip + 32*il + l0; const int y_offset = 128*ip + 32*il + l0;
@ -1171,12 +1240,19 @@ kernel void kernel_mul_mat_q3_K_f32(
device const float * y1 = yy + ix*QK_K + y_offset; device const float * y1 = yy + ix*QK_K + y_offset;
float sumf1[2] = {0.f}, sumf2[2] = {0.f}; uint32_t scales32, aux32;
for (int i = ix; i < nb; i += 2) { thread uint16_t * scales16 = (thread uint16_t *)&scales32;
thread const int8_t * scales = (thread const int8_t *)&scales32;
float sumf1[2] = {0.f};
float sumf2[2] = {0.f};
for (int i = ix; i < nb; i += 4) {
for (int l = 0; l < 8; ++l) { for (int l = 0; l < 8; ++l) {
yl[l+0] = y1[l+ 0]; yl[l+ 0] = y1[l+ 0];
yl[l+8] = y1[l+16]; yl[l+ 8] = y1[l+16];
yl[l+16] = y1[l+32];
yl[l+24] = y1[l+48];
} }
device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset); device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset);
@ -1187,27 +1263,43 @@ kernel void kernel_mul_mat_q3_K_f32(
for (int row = 0; row < 2; ++row) { for (int row = 0; row < 2; ++row) {
const float d_all = (float)dh[0]; const float d_all = (float)dh[0];
const char2 scales = as_type<char2>((uint16_t)(((a[il] >> s_shift1) & kmask2) | (((a[ik] >> s_shift2) & kmask1) << 4)));
float s1 = 0, s2 = 0; scales16[0] = a[4];
for (int l = 0; l < n; l += 2) { scales16[1] = a[5];
const uint16_t qs = q[l/2]; aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030;
s1 += yl[l+0] * ((int32_t)(qs & qm1) - ((h[l/2] & m1) ? 0 : v1)); scales16[0] = a[il+0];
s2 += yl[l+1] * ((int32_t)(qs & qm2) - ((h[l/2] & m2) ? 0 : v2)); scales16[1] = a[il+1];
} scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32;
float d = d_all * (s1 + 1.f/256.f * s2);
sumf1[row] += d * scales[0];
sumf2[row] += d;
s1 = s2 = 0; float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0;
for (int l = 0; l < n; l += 2) { for (int l = 0; l < n; l += 2) {
const uint16_t qs = q[l/2+8]; const int32_t qs = q[l/2];
s1 += yl[l+8] * ((int32_t)(qs & qm1) - ((h[l/2+8] & m1) ? 0 : v1)); s1 += yl[l+0] * (qs & qm[il/2][0]);
s2 += yl[l+9] * ((int32_t)(qs & qm2) - ((h[l/2+8] & m2) ? 0 : v2)); s2 += yl[l+1] * (qs & qm[il/2][1]);
s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]);
s4 += yl[l+16] * (qs & qm[il/2][2]);
s5 += yl[l+17] * (qs & qm[il/2][3]);
s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]);
} }
d = d_all * (s1 + 1.f/256.f * s2); float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
sumf1[row] += d * scales[1]; float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
sumf2[row] += d; sumf1[row] += d1 * (scales[0] - 32);
sumf2[row] += d2 * (scales[2] - 32);
s1 = s2 = s3 = s4 = s5 = s6 = 0;
for (int l = 0; l < n; l += 2) {
const int32_t qs = q[l/2+8];
s1 += yl[l+8] * (qs & qm[il/2][0]);
s2 += yl[l+9] * (qs & qm[il/2][1]);
s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]);
s4 += yl[l+24] * (qs & qm[il/2][2]);
s5 += yl[l+25] * (qs & qm[il/2][3]);
s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 0.f : yl[l+25]);
}
d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
sumf1[row] += d1 * (scales[1] - 32);
sumf2[row] += d2 * (scales[3] - 32);
q += step; q += step;
h += step; h += step;
@ -1216,17 +1308,20 @@ kernel void kernel_mul_mat_q3_K_f32(
} }
y1 += 2 * QK_K; y1 += 4 * QK_K;
} }
for (int row = 0; row < 2; ++row) { for (int row = 0; row < 2; ++row) {
const float sumf = (sumf1[row] - 32.f*sumf2[row]) / (1 << shift); const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift);
const float tot = simd_sum(sumf); sumf1[row] = simd_sum(sumf);
if (tiisg == 0) { }
dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; if (tiisg == 0) {
for (int row = 0; row < 2; ++row) {
dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
} }
} }
} }
#else #else
kernel void kernel_mul_mat_q3_K_f32( kernel void kernel_mul_mat_q3_K_f32(
@ -1579,17 +1674,25 @@ kernel void kernel_mul_mat_q5_K_f32(
sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2); sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2); sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
float4 acc = {0.f, 0.f, 0.f, 0.f}; float4 acc1 = {0.f};
float4 acc2 = {0.f};
for (int l = 0; l < n; ++l) { for (int l = 0; l < n; ++l) {
uint8_t h = qh[l]; uint8_t h = qh[l];
acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0)); acc1[0] += yl[l+0] * (q1[l] & 0x0F);
acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0)); acc1[1] += yl[l+8] * (q1[l] & 0xF0);
acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0)); acc1[2] += yh[l+0] * (q2[l] & 0x0F);
acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0)); acc1[3] += yh[l+8] * (q2[l] & 0xF0);
acc2[0] += h & hm1 ? yl[l+0] : 0.f;
acc2[1] += h & hm2 ? yl[l+8] : 0.f;
acc2[2] += h & hm3 ? yh[l+0] : 0.f;
acc2[3] += h & hm4 ? yh[l+8] : 0.f;
} }
const float dall = dh[0]; const float dall = dh[0];
const float dmin = dh[1]; const float dmin = dh[1];
sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) - sumf[row] += dall * (sc8[0] * (acc1[0] + 16.f*acc2[0]) +
sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) +
sc8[4] * (acc1[2] + 16.f*acc2[2]) +
sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) -
dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
q1 += step; q1 += step;
@ -1772,29 +1875,34 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
template <typename type4x4> template <typename type4x4>
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 1); device const uint16_t * qs = ((device const uint16_t *)xb + 1);
const half d = il ? (xb->d / 16.h) : xb->d; const float d1 = il ? (xb->d / 16.h) : xb->d;
const half m = il ? ( -8.h * 16.h) : -8.h; const float d2 = d1 / 256.f;
const float md = -8.h * xb->d;
const ushort mask0 = il ? 0x00F0 : 0x000F; const ushort mask0 = il ? 0x00F0 : 0x000F;
const ushort mask1 = il ? 0xF000 : 0x0F00; const ushort mask1 = mask0 << 8;
for (int i=0;i<8;i++) { for (int i=0;i<8;i++) {
reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) + m) * d; reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d; reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
} }
} }
template <typename type4x4> template <typename type4x4>
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 2); device const uint16_t * qs = ((device const uint16_t *)xb + 2);
const half d = il ? (xb->d / 16.h) : xb->d; const float d1 = il ? (xb->d / 16.h) : xb->d;
const half m = xb->m; const float d2 = d1 / 256.f;
const float m = xb->m;
const ushort mask0 = il ? 0x00F0 : 0x000F; const ushort mask0 = il ? 0x00F0 : 0x000F;
const ushort mask1 = il ? 0xF000 : 0x0F00; const ushort mask1 = mask0 << 8;
for (int i=0;i<8;i++) { for (int i=0;i<8;i++) {
reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) * d) + m; reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) * d) + m; reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
} }
} }
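A note on the rewritten q4_0/q4_1 dequantization above: mask1 selects the upper byte of each 16-bit word, so scaling by d2 = d1/256.f is equivalent to the old shift, i.e. d2*(qs[i] & mask1) == d1*((qs[i] & mask1) >> 8). Folding the offset into md = -8.h*xb->d (or the q4_1 minimum m) leaves a single multiply-add per output value, which is presumably friendlier to the Metal compiler than the previous shift-add-multiply form.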
@ -1830,7 +1938,7 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
template <typename type4x4> template <typename type4x4>
void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
const float d_all = (float)(xb->d); const half d_all = xb->d;
device const uint8_t * q = (device const uint8_t *)xb->qs; device const uint8_t * q = (device const uint8_t *)xb->qs;
device const uint8_t * h = (device const uint8_t *)xb->hmask; device const uint8_t * h = (device const uint8_t *)xb->hmask;
device const int8_t * scales = (device const int8_t *)xb->scales; device const int8_t * scales = (device const int8_t *)xb->scales;
@ -1843,17 +1951,20 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
((il/4)>0 ? 12 : 3); ((il/4)>0 ? 12 : 3);
uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) : \ int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
(scale_2&kmask2) | ((scale_1&kmask1) << 4); : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h);
const half ml = 4.h * dl;
il = (il/2)%4; il = (il/2) & 3;
float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
dl *= coef;
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i] & m) ? 0 : 4.f/coef)); reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
} }
#else #else
float kcoef = il&1 ? 1.f/16.f : 1.f; float kcoef = il&1 ? 1.f/16.f : 1.f;
uint16_t kmask = il&1 ? 0xF0 : 0x0F; uint16_t kmask = il&1 ? 0xF0 : 0x0F;
@ -1867,31 +1978,37 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
#endif #endif
} }
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
: uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
}
template <typename type4x4> template <typename type4x4>
void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
device const uint8_t * q = xb->qs; device const uchar * q = xb->qs;
#if QK_K == 256 #if QK_K == 256
const float d = (float)(xb->d);
const float min = (float)(xb->dmin);
short is = (il/4) * 2; short is = (il/4) * 2;
q = q + (il/4) * 32 + 16 * (il&1); q = q + (il/4) * 32 + 16 * (il&1);
il = il%4; il = il & 3;
const uchar4 sc = get_scale_min_k4(is, xb->scales); const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; const half d = il < 2 ? xb->d : xb->d / 16.h;
const float ml = il<2 ? min * sc[1] : min * sc[3]; const half min = xb->dmin;
const half dl = d * sc[0];
const half ml = min * sc[1];
#else #else
q = q + 16 * (il&1); q = q + 16 * (il&1);
device const uint8_t * s = xb->scales; device const uint8_t * s = xb->scales;
device const half2 * dh = (device const half2 *)xb->d; device const half2 * dh = (device const half2 *)xb->d;
const float2 d = (float2)dh[0]; const float2 d = (float2)dh[0];
const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h; const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
const float ml = il<2 ? d[1] * (s[0]>>4) : d[1 ]* (s[1]>>4); const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4);
#endif #endif
const ushort mask = il<2 ? 0x0F : 0xF0; const ushort mask = il<2 ? 0x0F : 0xF0;
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
reg[i/4][i%4] = dl * (q[i] & mask) - ml; reg[i/4][i%4] = dl * (q[i] & mask) - ml;
} }
} }
template <typename type4x4> template <typename type4x4>
@ -1900,19 +2017,19 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
device const uint8_t * qh = xb->qh; device const uint8_t * qh = xb->qh;
#if QK_K == 256 #if QK_K == 256
const float d = (float)(xb->d);
const float min = (float)(xb->dmin);
short is = (il/4) * 2; short is = (il/4) * 2;
q = q + 32 * (il/4) + 16 * (il&1); q = q + 32 * (il/4) + 16 * (il&1);
qh = qh + 16 * (il&1); qh = qh + 16 * (il&1);
uint8_t ul = 1 << (il/2); uint8_t ul = 1 << (il/2);
il = il%4; il = il & 3;
const uchar4 sc = get_scale_min_k4(is, xb->scales); const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; const half d = il < 2 ? xb->d : xb->d / 16.h;
const float ml = il<2 ? min * sc[1] : min * sc[3]; const half min = xb->dmin;
const half dl = d * sc[0];
const half ml = min * sc[1];
const ushort mask = il<2 ? 0x0F : 0xF0; const ushort mask = il<2 ? 0x0F : 0xF0;
const float qh_val = il<2 ? 16.f : 256.f; const half qh_val = il<2 ? 16.h : 256.h;
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
} }
@ -1931,7 +2048,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
template <typename type4x4> template <typename type4x4>
void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
const float d_all = (float)(xb->d); const half d_all = xb->d;
device const uint8_t * ql = (device const uint8_t *)xb->ql; device const uint8_t * ql = (device const uint8_t *)xb->ql;
device const uint8_t * qh = (device const uint8_t *)xb->qh; device const uint8_t * qh = (device const uint8_t *)xb->qh;
device const int8_t * scales = (device const int8_t *)xb->scales; device const int8_t * scales = (device const int8_t *)xb->scales;
@ -1939,19 +2056,21 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
#if QK_K == 256 #if QK_K == 256
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
qh = qh + 32*(il/8) + 16*(il&1); qh = qh + 32*(il/8) + 16*(il&1);
float sc = scales[(il%2) + 2 * ((il/2))]; half sc = scales[(il%2) + 2 * ((il/2))];
il = (il/2)%4; il = (il/2) & 3;
#else #else
ql = ql + 16 * (il&1); ql = ql + 16 * (il&1);
float sc = scales[il]; half sc = scales[il];
#endif #endif
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
const half coef = il>1 ? 1.f/16.h : 1.h;
const half ml = d_all * sc * 32.h;
const half dl = d_all * sc * coef;
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
const float coef = il>1 ? 1.f/16.f : 1.f; reg[i/4][i%4] = dl * q - ml;
float q = il&1 ? ((ql[i]&kmask2)|((qh[i]&kmask1)<<2)) - 32.f/coef : \
((ql[i]&kmask2)|((qh[i]&kmask1)<<4)) - 32.f/coef;
reg[i/4][i%4] = d_all * sc * q * coef;
} }
} }
View file
@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
return; return;
} }
cl_mem mem = (cl_mem)tensor->data; cl_mem mem = (cl_mem)tensor->extra;
clReleaseMemObject(mem); clReleaseMemObject(mem);
} }
@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
size_t d_size; size_t d_size;
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0 cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted. cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
@ -1491,7 +1491,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size; size_t d_size;
cl_mem d_X; cl_mem d_X;
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data; d_X = (cl_mem) src0->extra;
} else { } else {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
} }
@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size; size_t d_size;
cl_mem d_X; cl_mem d_X;
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data; d_X = (cl_mem) src0->extra;
} else { } else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
} }
@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
events.emplace_back(); events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
} else if (src0->backend == GGML_BACKEND_GPU) { } else if (src0->backend == GGML_BACKEND_GPU) {
d_Q = (cl_mem) src0->data; d_Q = (cl_mem) src0->extra;
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
CL_CHECK(clFinish(queue)); CL_CHECK(clFinish(queue));
tensor->data = dst; tensor->extra = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
} }
40
ggml.c
View file
@ -1,4 +1,3 @@
#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#include "ggml.h" #include "ggml.h"
@ -47,6 +46,10 @@
// disable "possible loss of data" to avoid hundreds of casts // disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :) // we should just be careful :)
#pragma warning(disable: 4244 4267) #pragma warning(disable: 4244 4267)
// disable POSIX deprecation warnings
// these functions are never going away, anyway
#pragma warning(disable: 4996)
#endif #endif
#if defined(_WIN32) #if defined(_WIN32)
@ -103,6 +106,9 @@ typedef void * thread_ret_t;
#include <sys/stat.h> #include <sys/stat.h>
#include <unistd.h> #include <unistd.h>
#endif
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif #endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@ -192,9 +198,15 @@ typedef void * thread_ret_t;
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else #else
inline static void * ggml_aligned_malloc(size_t size) { inline static void * ggml_aligned_malloc(size_t size) {
if (size == 0) {
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
return NULL;
}
void * aligned_memory = NULL; void * aligned_memory = NULL;
#ifdef GGML_USE_METAL #ifdef GGML_USE_CPU_HBM
int result = posix_memalign(&aligned_memory, getpagesize(), size); int result = hbw_posix_memalign(&aligned_memory, 16, size);
#elif GGML_USE_METAL
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
#else #else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
#endif #endif
@ -215,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
return aligned_memory; return aligned_memory;
} }
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
#ifdef GGML_USE_CPU_HBM
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
#else
#define GGML_ALIGNED_FREE(ptr) free(ptr) #define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif #endif
#endif
#define UNUSED GGML_UNUSED #define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@ -294,12 +310,14 @@ typedef double ggml_float;
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h> #include <intrin.h>
#else #else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv) #if !defined(__riscv)
#include <immintrin.h> #include <immintrin.h>
#endif #endif
#endif #endif
#endif #endif
#endif #endif
#endif
#ifdef __riscv_v_intrinsic #ifdef __riscv_v_intrinsic
#include <riscv_vector.h> #include <riscv_vector.h>
@ -4566,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
return NULL; return NULL;
} }
// allow to call ggml_init with 0 size
if (params.mem_size == 0) {
params.mem_size = GGML_MEM_ALIGN;
}
const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN); const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
*ctx = (struct ggml_context) { *ctx = (struct ggml_context) {
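With the guard above, ggml_init can be called with mem_size == 0; the size is silently padded up to GGML_MEM_ALIGN instead of producing a zero-sized buffer. A minimal sketch of the now-valid call, assuming the caller only wants a context for one-time setup/queries rather than for allocating tensors in it:

    struct ggml_init_params params = {
        /*.mem_size   =*/ 0,     // padded to GGML_MEM_ALIGN internally
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);
    // ... use ctx for work that does not allocate tensor data ...
    ggml_free(ctx);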
@ -4768,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
size_t obj_alloc_size = 0; size_t obj_alloc_size = 0;
if (view_src == NULL && ctx->no_alloc == false) { if (view_src == NULL && !ctx->no_alloc) {
if (ctx->scratch.data != NULL) { if (ctx->scratch.data != NULL) {
// allocate tensor data in the scratch buffer // allocate tensor data in the scratch buffer
if (ctx->scratch.offs + data_size > ctx->scratch.size) { if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@ -5469,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
} }
if (inplace) { if (inplace) {
GGML_ASSERT(is_node == false); GGML_ASSERT(!is_node);
} }
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -5512,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
} }
if (inplace) { if (inplace) {
GGML_ASSERT(is_node == false); GGML_ASSERT(!is_node);
} }
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -18854,7 +18877,6 @@ static enum ggml_opt_result linesearch_backtracking(
// strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
return count; return count;
} }
return count;
} }
} }
@ -19957,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
struct ggml_tensor * data = NULL; struct ggml_tensor * data = NULL;
if (params.no_alloc == false) { if (!params.no_alloc) {
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size); data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
ok = ok && data != NULL; ok = ok && data != NULL;
@ -19998,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
} }
// point the data member to the appropriate location in the binary blob using the tensor infos // point the data member to the appropriate location in the binary blob using the tensor infos
if (params.no_alloc == false) { if (!params.no_alloc) {
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
} }
34
grammars/json_arr.gbnf Normal file
View file
@ -0,0 +1,34 @@
# This is the same as json.gbnf but we restrict whitespaces at the end of the root array
# Useful for generating JSON arrays
root ::= arr
value ::= object | array | string | number | ("true" | "false" | "null") ws
arr ::=
"[\n" ws (
value
(",\n" ws value)*
)? "]"
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
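As the header comment says, this grammar forces the model to emit a bare JSON array (whitespace after the closing bracket of the root array is deliberately not allowed). One way to exercise it, assuming the main example's --grammar-file option is used to load a .gbnf file, is to pass --grammar-file grammars/json_arr.gbnf together with a prompt that asks for a list; sampling is then restricted to tokens that keep the partial output derivable from the root rule.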
View file
@ -83,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
float ax = fabsf(x[i]); float ax = fabsf(x[i]);
if (ax > amax) { amax = ax; max = x[i]; } if (ax > amax) { amax = ax; max = x[i]; }
} }
if (!amax) { // all zero if (amax < 1e-30f) { // all zero
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
L[i] = 0; L[i] = 0;
} }
@ -1086,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
} }
if (!max_abs_scale) {
memset(&y[i], 0, sizeof(block_q6_K));
y[i].d = ggml_fp32_to_fp16(0.f);
x += QK_K;
continue;
}
float iscale = -128.f/max_scale; float iscale = -128.f/max_scale;
y[i].d = ggml_fp32_to_fp16(1/iscale); y[i].d = ggml_fp32_to_fp16(1/iscale);
for (int ib = 0; ib < QK_K/16; ++ib) { for (int ib = 0; ib < QK_K/16; ++ib) {
146
llama.cpp
View file
@ -1,8 +1,3 @@
// Defines fileno on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "llama.h" #include "llama.h"
#include "ggml.h" #include "ggml.h"
@ -126,6 +121,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
} }
s = std::move(result); s = std::move(result);
} }
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif
static void zeros(std::ofstream & file, size_t n) { static void zeros(std::ofstream & file, size_t n) {
char zero = 0; char zero = 0;
@ -450,6 +448,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
#elif GGML_USE_METAL #elif GGML_USE_METAL
# define llama_host_malloc(n) ggml_metal_host_malloc(n) # define llama_host_malloc(n) ggml_metal_host_malloc(n)
# define llama_host_free(data) ggml_metal_host_free(data) # define llama_host_free(data) ggml_metal_host_free(data)
#elif GGML_USE_CPU_HBM
# define llama_host_malloc(n) hbw_malloc(n)
# define llama_host_free(data) if (data != NULL) hbw_free(data)
#else #else
# define llama_host_malloc(n) malloc(n) # define llama_host_malloc(n) malloc(n)
# define llama_host_free(data) free(data) # define llama_host_free(data) free(data)
@ -606,16 +607,16 @@ struct llama_mmap {
if (prefetch > 0) { if (prefetch > 0) {
// Advise the kernel to preload the mapped memory // Advise the kernel to preload the mapped memory
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
strerror(errno)); strerror(errno));
} }
} }
if (numa) { if (numa) {
// advise the kernel not to use readahead // advise the kernel not to use readahead
// (because the next page might not belong on the same node) // (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) { if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
strerror(errno)); strerror(errno));
} }
} }
@ -1489,7 +1490,11 @@ struct llama_model_loader {
// allocate temp buffer if not using mmap // allocate temp buffer if not using mmap
if (!use_mmap && cur->data == NULL) { if (!use_mmap && cur->data == NULL) {
GGML_ASSERT(cur->backend != GGML_BACKEND_CPU); GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
cur->data = malloc(ggml_nbytes(cur)); #ifdef GGML_USE_CPU_HBM
cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
#else
cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
#endif
} }
load_data_for(cur); load_data_for(cur);
@ -2942,7 +2947,12 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread // for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
// we still need some threads to process all non-mul_mat ops, but not too many, to avoid interfering
// with the BLAS calls. need a better solution
if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
n_threads = std::min(4, n_threads);
}
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@ -3047,33 +3057,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL; return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
} }
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
}
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
}
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) { static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE; return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
} }
static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(llama_is_control_token(vocab, id));
return id == vocab.special_bos_id;
}
static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
GGML_ASSERT(llama_is_control_token(vocab, id));
return id == vocab.special_eos_id;
}
static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
return id == vocab.special_pad_id;
}
static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) { static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(llama_is_byte_token(vocab, id)); GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id); const auto& token_data = vocab.id_to_token.at(id);
@ -3850,6 +3837,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
delete grammar; delete grammar;
} }
struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
// redirect elements in stacks to point to new rules
for (size_t is = 0; is < result->stacks.size(); is++) {
for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
result->stacks[is][ie] = &result->rules[ir0][ir1];
}
}
}
}
}
return result;
}
// //
// sampling // sampling
// //
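llama_grammar_copy above duplicates the rules, stacks and partial UTF-8 state, then re-points every stack element at the copy's own rules, so the clone can be advanced independently of the original. A minimal usage sketch, assuming two sampling streams that must not share grammar state (variable names are illustrative):

    // give a second sampling stream its own grammar state
    struct llama_grammar * grammar_aux = llama_grammar_copy(grammar);

    // ... accept tokens into grammar_aux for the second stream ...

    llama_grammar_free(grammar_aux);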
@ -4776,9 +4782,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<std::thread> workers; std::vector<std::thread> workers;
std::mutex mutex; std::mutex mutex;
#ifdef GGML_USE_K_QUANTS
auto use_more_bits = [] (int i_layer, int num_layers) -> bool { auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}; };
#endif
int idx = 0; int idx = 0;
@ -5340,7 +5348,7 @@ struct llama_context_params llama_context_default_params() {
/*.seed =*/ LLAMA_DEFAULT_SEED, /*.seed =*/ LLAMA_DEFAULT_SEED,
/*.n_ctx =*/ 512, /*.n_ctx =*/ 512,
/*.n_batch =*/ 512, /*.n_batch =*/ 512,
/*.gpu_layers =*/ 0, /*.n_gpu_layers =*/ 0,
/*.main_gpu =*/ 0, /*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr, /*.tensor_split =*/ nullptr,
/*.rope_freq_base =*/ 10000.0f, /*.rope_freq_base =*/ 10000.0f,
@ -5357,6 +5365,10 @@ struct llama_context_params llama_context_default_params() {
/*.embedding =*/ false, /*.embedding =*/ false,
}; };
#ifdef GGML_USE_METAL
result.n_gpu_layers = 1;
#endif
return result; return result;
} }
@ -5549,43 +5561,43 @@ struct llama_context * llama_new_context_with_model(
} }
#endif #endif
} }
}
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
if (params.n_gpu_layers > 0) { if (params.n_gpu_layers > 0) {
// this allocates all Metal resources and memory buffers // this allocates all Metal resources and memory buffers
void * data_ptr = NULL; void * data_ptr = NULL;
size_t data_size = 0; size_t data_size = 0;
if (params.use_mmap) { if (params.use_mmap) {
data_ptr = ctx->model.mapping->addr; data_ptr = ctx->model.mapping->addr;
data_size = ctx->model.mapping->size; data_size = ctx->model.mapping->size;
} else { } else {
data_ptr = ggml_get_mem_buffer(ctx->model.ctx); data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
data_size = ggml_get_mem_size (ctx->model.ctx); data_size = ggml_get_mem_size (ctx->model.ctx);
} }
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
#define LLAMA_METAL_CHECK_BUF(result) \ #define LLAMA_METAL_CHECK_BUF(result) \
if (!(result)) { \ if (!(result)) { \
LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
llama_free(ctx); \ llama_free(ctx); \
return NULL; \ return NULL; \
} }
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
#undef LLAMA_METAL_CHECK_BUF #undef LLAMA_METAL_CHECK_BUF
} }
#endif #endif
}
#ifdef GGML_USE_MPI #ifdef GGML_USE_MPI
ctx->ctx_mpi = ggml_mpi_init(); ctx->ctx_mpi = ggml_mpi_init();
@ -5621,15 +5633,19 @@ void llama_free(struct llama_context * ctx) {
} }
int llama_n_vocab(const struct llama_context * ctx) { int llama_n_vocab(const struct llama_context * ctx) {
return ctx->model.vocab.id_to_token.size(); return llama_model_n_vocab(&ctx->model);
} }
int llama_n_ctx(const struct llama_context * ctx) { int llama_n_ctx(const struct llama_context * ctx) {
return ctx->model.hparams.n_ctx; return llama_model_n_ctx(&ctx->model);
}
int llama_n_ctx_train(const struct llama_context * ctx) {
return llama_model_n_ctx_train(&ctx->model);
} }
int llama_n_embd(const struct llama_context * ctx) { int llama_n_embd(const struct llama_context * ctx) {
return ctx->model.hparams.n_embd; return llama_model_n_embd(&ctx->model);
} }
enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) { enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@ -5644,6 +5660,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
return model->hparams.n_ctx; return model->hparams.n_ctx;
} }
int llama_model_n_ctx_train(const struct llama_model * model) {
return model->hparams.n_ctx_train;
}
int llama_model_n_embd(const struct llama_model * model) { int llama_model_n_embd(const struct llama_model * model) {
return model->hparams.n_embd; return model->hparams.n_embd;
} }
@ -5919,7 +5939,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
rng_ss.str(std::string(&rng_buf[0], rng_size)); rng_ss.str(std::string(&rng_buf[0], rng_size));
rng_ss >> ctx->rng; rng_ss >> ctx->rng;
GGML_ASSERT(rng_ss.fail() == false); GGML_ASSERT(!rng_ss.fail());
} }
// set logits // set logits
16
llama.h
View file
@ -245,15 +245,17 @@ extern "C" {
LLAMA_API bool llama_mmap_supported (void); LLAMA_API bool llama_mmap_supported (void);
LLAMA_API bool llama_mlock_supported(void); LLAMA_API bool llama_mlock_supported(void);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx); LLAMA_API int llama_n_vocab (const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx); LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
LLAMA_API int llama_model_n_vocab(const struct llama_model * model); LLAMA_API int llama_model_n_vocab (const struct llama_model * model);
LLAMA_API int llama_model_n_ctx (const struct llama_model * model); LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
LLAMA_API int llama_model_n_embd (const struct llama_model * model); LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
// Get a string describing the model type // Get a string describing the model type
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
@ -410,6 +412,8 @@ extern "C" {
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
// //
// Sampling functions // Sampling functions
// //
View file
@ -76,7 +76,7 @@ void * align_with_offset(void * ptr, int offset) {
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
} }
void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function<size_t(void)> function) { void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
int64_t min_time_us = INT64_MAX; int64_t min_time_us = INT64_MAX;
int64_t total_time_us = 0; int64_t total_time_us = 0;
int64_t min_time_cycles = INT64_MAX; int64_t min_time_cycles = INT64_MAX;