Merge branch 'master' into stablelm-support

commit 8917767f56
Author: Galunid
Date:   2023-11-05 01:01:48 +01:00
71 changed files with 7311 additions and 6369 deletions


@@ -1,7 +1,7 @@
 ---
 name: Bug template
 about: Used to report bugs in llama.cpp
-labels: ["bug"]
+labels: ["bug-unconfirmed"]
 assignees: ''
 ---

.gitignore

@@ -15,6 +15,7 @@
 .DS_Store
 .build/
 .cache/
+.ccls-cache/
 .direnv/
 .envrc
 .swiftpm
@@ -64,7 +65,7 @@ models-mnt
 /parallel
 /train-text-from-scratch
 /vdot
-build-info.h
+/common/build-info.cpp
 arm_neon.h
 compile_commands.json
 CMakeSettings.json

CMakeLists.txt

@@ -44,7 +44,7 @@ endif()
 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
+option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
 option(LLAMA_LTO "llama: enable link time optimization" OFF)
 # debug
@@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use CUDA" OFF)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
+option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -93,46 +94,12 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
-option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ON)
-#
-# Build info header
-#
-# Generate initial build-info.h
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-# Is git submodule
-if(NOT IS_DIRECTORY "${GIT_DIR}")
-file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
-string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
-endif()
-# Add a custom target for build-info.h
-add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
-# Add a custom command to rebuild build-info.h when .git/index changes
-add_custom_command(
-OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
-COMMENT "Generating build details from Git"
-COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
-WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-DEPENDS "${GIT_DIR}/index"
-VERBATIM
-)
-else()
-message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-endif()
 #
 # Compile flags
 #
@@ -277,13 +244,8 @@ if (LLAMA_BLAS)
 endif()
 endif()
-if (LLAMA_K_QUANTS)
-set(GGML_HEADERS_EXTRA k_quants.h)
-set(GGML_SOURCES_EXTRA k_quants.c)
-add_compile_definitions(GGML_USE_K_QUANTS)
-if (LLAMA_QKK_64)
+if (LLAMA_QKK_64)
 add_compile_definitions(GGML_QKK_64)
-endif()
 endif()
 if (LLAMA_CUBLAS)
@@ -305,6 +267,9 @@ if (LLAMA_CUBLAS)
 if (LLAMA_CUDA_FORCE_DMMV)
 add_compile_definitions(GGML_CUDA_FORCE_DMMV)
 endif()
+if (LLAMA_CUDA_FORCE_MMQ)
+add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+endif()
 add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
 add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
 if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -331,6 +296,7 @@ if (LLAMA_CUBLAS)
 set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
 else()
 set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
 endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -404,6 +370,9 @@ if (LLAMA_HIPBLAS)
 if (LLAMA_CUDA_FORCE_DMMV)
 target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
 endif()
+if (LLAMA_CUDA_FORCE_MMQ)
+target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
+endif()
 target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
 target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
 target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -665,6 +634,8 @@ add_library(ggml OBJECT
 ggml-alloc.h
 ggml-backend.c
 ggml-backend.h
+ggml-quants.c
+ggml-quants.h
 ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
 ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
 ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}

Makefile

@@ -342,13 +342,9 @@ else
 MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
-ifndef LLAMA_NO_K_QUANTS
-MK_CPPFLAGS += -DGGML_USE_K_QUANTS
-OBJS += k_quants.o
 ifdef LLAMA_QKK_64
 MK_CPPFLAGS += -DGGML_QKK_64
 endif
-endif
 ifndef LLAMA_NO_ACCELERATE
 # Mac OS - include Accelerate framework.
@@ -397,6 +393,9 @@ endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
 NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_FORCE_MMQ
+NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
 NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
@@ -494,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
-ifndef LLAMA_NO_K_QUANTS
-k_quants.o: k_quants.c k_quants.h
-$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_NO_K_QUANTS
 # combine build flags with cmdline overrides
 override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
 override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
@@ -539,13 +533,16 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 $(CC) $(CFLAGS) -c $< -o $@
-OBJS += ggml-alloc.o ggml-backend.o
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+$(CC) $(CFLAGS) -c $< -o $@
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 $(CXX) $(CXXFLAGS) -c $< -o $@
-COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
-COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
 common.o: common/common.cpp $(COMMON_H_DEPS)
 $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -566,46 +563,46 @@ libllama.so: llama.o ggml.o $(OBJS)
 $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 clean:
-rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 #
 # Examples
 #
-main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 @echo
 @echo '==== Run ./main -h for help. ===='
 @echo
-infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
@@ -617,7 +614,7 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
@@ -626,19 +623,19 @@ llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 ifdef LLAMA_METAL
@@ -651,7 +648,7 @@ swift: examples/batched.swift
 (cd examples/batched.swift; make build)
 endif
-build-info.h: $(wildcard .git/index) scripts/build-info.sh
+common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
 @sh scripts/build-info.sh $(CC) > $@.tmp
 @if ! cmp -s $@.tmp $@; then \
 mv $@.tmp $@; \
@@ -659,13 +656,16 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 rm $@.tmp; \
 fi
+build-info.o: common/build-info.cpp
+$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
 #
 # Tests
 #
 tests: $(TEST_TARGETS)
-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 run-benchmark-matmult: benchmark-matmult
@@ -679,40 +679,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 tests/test-c.o: tests/test-c.c llama.h

Package.swift

@@ -42,13 +42,12 @@ let package = Package(
 "llama.cpp",
 "ggml-alloc.c",
 "ggml-backend.c",
-"k_quants.c",
+"ggml-quants.c",
 ] + additionalSources,
 resources: resources,
 publicHeadersPath: "spm-headers",
 cSettings: [
 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-.define("GGML_USE_K_QUANTS"),
 .define("GGML_USE_ACCELERATE")
 // NOTE: NEW_LAPACK will required iOS version 16.4+
 // We should consider add this in the future when we drop support for iOS 14

README.md

@@ -2,7 +2,6 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
-[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -11,8 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 ### Hot topics
-- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
-- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
+- ⚠️ **Upcoming change that might break functionality. Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912
 ----

build.zig

@@ -10,7 +10,6 @@ const Maker = struct {
 builder: *std.build.Builder,
 target: CrossTarget,
 optimize: Mode,
-config_header: *ConfigHeader,
 enable_lto: bool,
 include_dirs: ArrayList([]const u8),
@@ -41,26 +40,24 @@ const Maker = struct {
 const commit_hash = try std.ChildProcess.exec(
 .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
 );
-const config_header = builder.addConfigHeader(
-.{ .style = .blank, .include_path = "build-info.h" },
-.{
-.BUILD_NUMBER = 0,
-.BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
-.BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
-.BUILD_TARGET = try target.allocDescription(builder.allocator),
-},
-);
+try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
+\\int LLAMA_BUILD_NUMBER = {};
+\\char const *LLAMA_COMMIT = "{s}";
+\\char const *LLAMA_COMPILER = "Zig {s}";
+\\char const *LLAMA_BUILD_TARGET = "{s}";
+\\
+, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
 var m = Maker{
 .builder = builder,
 .target = target,
 .optimize = builder.standardOptimizeOption(.{}),
-.config_header = config_header,
 .enable_lto = false,
 .include_dirs = ArrayList([]const u8).init(builder.allocator),
 .cflags = ArrayList([]const u8).init(builder.allocator),
 .cxxflags = ArrayList([]const u8).init(builder.allocator),
 .objs = ArrayList(*Compile).init(builder.allocator),
 };
 try m.addCFlag("-std=c11");
 try m.addCxxFlag("-std=c++11");
 try m.addProjectInclude(&.{});
@@ -72,7 +69,7 @@ const Maker = struct {
 const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
 if (o.target.getAbi() != .msvc)
 o.defineCMacro("_GNU_SOURCE", null);
-o.addConfigHeader(m.config_header);
 if (std.mem.endsWith(u8, src, ".c")) {
 o.addCSourceFiles(&.{src}, m.cflags.items);
 o.linkLibC();
@@ -85,7 +82,6 @@ const Maker = struct {
 o.linkLibCpp();
 }
 }
-o.addConfigHeader(m.config_header);
 for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
 o.want_lto = m.enable_lto;
 return o;
@@ -105,7 +101,6 @@ const Maker = struct {
 // linkLibCpp already add (libc++ + libunwind + libc)
 e.linkLibCpp();
 }
-e.addConfigHeader(m.config_header);
 m.builder.installArtifact(e);
 e.want_lto = m.enable_lto;
 return e;
@@ -116,16 +111,12 @@ pub fn build(b: *std.build.Builder) !void {
 var make = try Maker.init(b);
 make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
-if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
-try make.addFlag("-DGGML_USE_K_QUANTS");
-const k_quants = make.obj("k_quants", "k_quants.c");
-try make.objs.append(k_quants);
-}
 const ggml = make.obj("ggml", "ggml.c");
 const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
 const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
+const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
 const llama = make.obj("llama", "llama.cpp");
+const buildinfo = make.obj("common", "common/build-info.cpp");
 const common = make.obj("common", "common/common.cpp");
 const console = make.obj("console", "common/console.cpp");
 const sampling = make.obj("sampling", "common/sampling.cpp");
@@ -133,14 +124,14 @@ pub fn build(b: *std.build.Builder) !void {
 const train = make.obj("train", "common/train.cpp");
 const clip = make.obj("clip", "examples/llava/clip.cpp");
-_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
-_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
+_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
+_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
+_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
+const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
 if (server.target.isWindows()) {
 server.linkSystemLibrary("ws2_32");
 }

common/CMakeLists.txt

@@ -1,8 +1,46 @@
 # common
+# Build info header
+#
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+# Is git submodule
+if(NOT IS_DIRECTORY "${GIT_DIR}")
+file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
+string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
+set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+endif()
+set(GIT_INDEX "${GIT_DIR}/index")
+else()
+message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
+set(GIT_INDEX "")
+endif()
+# Add a custom command to rebuild build-info.cpp when .git/index changes
+add_custom_command(
+OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
+COMMENT "Generating build details from Git"
+COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
+-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
+-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
+WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
+DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
+VERBATIM
+)
+set(TARGET build_info)
+add_library(${TARGET} OBJECT build-info.cpp)
+if (BUILD_SHARED_LIBS)
+set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
 set(TARGET common)
-add_library(${TARGET} OBJECT
+add_library(${TARGET} STATIC
 common.h
 common.cpp
 sampling.h
@@ -21,4 +59,4 @@ endif()
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
+target_link_libraries(${TARGET} PRIVATE llama build_info)

common/build-info.cpp.in (new file)

@@ -0,0 +1,4 @@
+int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
+char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
+char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
+char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
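The template above only defines four globals; they are consumed through the extern declarations that common.h now carries (see the common.h changes further down). A minimal illustrative sketch of reading them, assuming the object built from the generated build-info.cpp is linked in (this program is not part of the commit):

    // Illustrative only: prints the same information as the print_build_info()
    // macro in common.h, using the globals defined by the generated build-info.cpp.
    #include <cstdio>

    extern int          LLAMA_BUILD_NUMBER;
    extern char const * LLAMA_COMMIT;
    extern char const * LLAMA_COMPILER;
    extern char const * LLAMA_BUILD_TARGET;

    int main() {
        printf("build = %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
        printf("built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
        return 0;
    }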

common/common.cpp

@@ -1,5 +1,4 @@
 #include "common.h"
-#include "build-info.h"
 #include "llama.h"
 #include <algorithm>
@@ -103,9 +102,24 @@ void process_escapes(std::string& input) {
 }
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+bool result = true;
+try {
+if (!gpt_params_parse_ex(argc, argv, params)) {
+gpt_print_usage(argc, argv, gpt_params());
+exit(0);
+}
+}
+catch (const std::invalid_argument & ex) {
+fprintf(stderr, "%s\n", ex.what());
+gpt_print_usage(argc, argv, gpt_params());
+exit(1);
+}
+return result;
+}
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 bool invalid_param = false;
 std::string arg;
-gpt_params default_params;
 const std::string arg_prefix = "--";
 llama_sampling_params & sparams = params.sparams;
@@ -204,12 +218,52 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.rope_freq_scale = std::stof(argv[i]);
+} else if (arg == "--rope-scaling") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+std::string value(argv[i]);
+/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
+else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
+else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
+else { invalid_param = true; break; }
 } else if (arg == "--rope-scale") {
 if (++i >= argc) {
 invalid_param = true;
 break;
 }
 params.rope_freq_scale = 1.0f/std::stof(argv[i]);
+} else if (arg == "--yarn-orig-ctx") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.yarn_orig_ctx = std::stoi(argv[i]);
+} else if (arg == "--yarn-ext-factor") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.yarn_ext_factor = std::stof(argv[i]);
+} else if (arg == "--yarn-attn-factor") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.yarn_attn_factor = std::stof(argv[i]);
+} else if (arg == "--yarn-beta-fast") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.yarn_beta_fast = std::stof(argv[i]);
+} else if (arg == "--yarn-beta-slow") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.yarn_beta_slow = std::stof(argv[i]);
 } else if (arg == "--memory-f32") {
 params.memory_f16 = false;
 } else if (arg == "--top-p") {
@@ -218,12 +272,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 sparams.top_p = std::stof(argv[i]);
+} else if (arg == "--min-p") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+sparams.min_p = std::stof(argv[i]);
 } else if (arg == "--temp") {
 if (++i >= argc) {
 invalid_param = true;
 break;
 }
 sparams.temp = std::stof(argv[i]);
+sparams.temp = std::max(sparams.temp, 0.0f);
 } else if (arg == "--tfs") {
 if (++i >= argc) {
 invalid_param = true;
@@ -342,6 +403,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.n_sequences = std::stoi(argv[i]);
+} else if (arg == "--p-accept" || arg == "-pa") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.p_accept = std::stof(argv[i]);
+} else if (arg == "--p-split" || arg == "-ps") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.p_split = std::stof(argv[i]);
 } else if (arg == "-m" || arg == "--model") {
 if (++i >= argc) {
 invalid_param = true;
@@ -547,11 +620,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 } else if (arg == "-h" || arg == "--help") {
-gpt_print_usage(argc, argv, default_params);
-#ifndef LOG_DISABLE_LOGS
-log_print_usage();
-#endif // LOG_DISABLE_LOGS
-exit(0);
+return false;
 } else if (arg == "--random-prompt") {
 params.random_prompt = true;
 } else if (arg == "--in-prefix-bos") {
@@ -610,22 +680,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 // End of Parse args for logging parameters
 #endif // LOG_DISABLE_LOGS
 } else {
-fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-gpt_print_usage(argc, argv, default_params);
-exit(1);
+throw std::invalid_argument("error: unknown argument: " + arg);
 }
 }
 if (invalid_param) {
-fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-gpt_print_usage(argc, argv, default_params);
-exit(1);
+throw std::invalid_argument("error: invalid parameter for argument: " + arg);
 }
 if (params.prompt_cache_all &&
 (params.interactive || params.interactive_first ||
 params.instruct)) {
-fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
-gpt_print_usage(argc, argv, default_params);
-exit(1);
+throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
 }
 if (params.escape) {
@@ -644,6 +709,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 const llama_sampling_params & sparams = params.sparams;
+printf("\n");
 printf("usage: %s [options]\n", argv[0]);
 printf("\n");
 printf("options:\n");
@@ -678,6 +744,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
 printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
 printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
 printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
 printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
 printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@@ -700,9 +767,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" --cfg-negative-prompt-file FNAME\n");
 printf(" negative prompt file to use for guidance. (default: empty)\n");
 printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
-printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
+printf(" --rope-scaling {none,linear,yarn}\n");
+printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
+printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
 printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
-printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
+printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
+printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
+printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
+printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
+printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
+printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
 printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
 printf(" --no-penalize-nl do not penalize newline token\n");
 printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
@@ -716,6 +790,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
 printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
 printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
+printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
+printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
 printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
 printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
 printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
@@ -743,7 +819,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
 printf(" --verbose-prompt print prompt before generation\n");
-fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
+printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
 printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
 printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
 printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -754,6 +830,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" -ld LOGDIR, --logdir LOGDIR\n");
 printf(" path under which to save YAML logs (no logging if unset)\n");
 printf("\n");
+#ifndef LOG_DISABLE_LOGS
+log_print_usage();
+#endif // LOG_DISABLE_LOGS
 }
 std::string get_system_info(const gpt_params & params) {
@@ -816,8 +895,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 cparams.f16_kv = params.memory_f16;
 cparams.logits_all = params.logits_all;
 cparams.embedding = params.embedding;
+cparams.rope_scaling_type = params.rope_scaling_type;
 cparams.rope_freq_base = params.rope_freq_base;
 cparams.rope_freq_scale = params.rope_freq_scale;
+cparams.yarn_ext_factor = params.yarn_ext_factor;
+cparams.yarn_attn_factor = params.yarn_attn_factor;
+cparams.yarn_beta_fast = params.yarn_beta_fast;
+cparams.yarn_beta_slow = params.yarn_beta_slow;
+cparams.yarn_orig_ctx = params.yarn_orig_ctx;
 return cparams;
 }
@@ -888,7 +973,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
 llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-llama_kv_cache_tokens_rm(lctx, -1, -1);
+llama_kv_cache_clear(lctx);
 llama_reset_timings(lctx);
 }
@@ -1127,8 +1212,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
 const llama_sampling_params & sparams = params.sparams;
-fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
-fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
+fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
+fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
 fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
 fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
 fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
@@ -1274,6 +1359,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
 fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
 fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
 fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
 fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
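Taken together, the common.cpp changes split argument handling into gpt_params_parse_ex(), which throws std::invalid_argument on unknown or malformed options, and gpt_params_parse(), which catches the exception, prints the message plus usage, and exits. A rough caller-side sketch, not part of the commit (the model path is only a placeholder):

    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        params.model = "models/7B/ggml-model.gguf"; // placeholder default

        // Bad input and -h/--help are handled inside gpt_params_parse():
        // it prints the error (if any) plus usage and exits, so the caller
        // only needs to check the return value.
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }

        // normal startup would continue here (load model, create context, ...)
        return 0;
    }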

common/common.h

@@ -9,6 +9,7 @@
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
+#include <cmath>
 #include <string>
 #include <vector>
 #include <random>
@@ -26,10 +27,16 @@
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
 #define print_build_info() do { \
-fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
-fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
+fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)
+// build info
+extern int LLAMA_BUILD_NUMBER;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;
 //
 // CLI argument parsing
 //
@@ -37,6 +44,7 @@ int32_t get_num_physical_cores();
 struct gpt_params {
 uint32_t seed = -1; // RNG seed
 int32_t n_threads = get_num_physical_cores();
 int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
 int32_t n_predict = -1; // new tokens to predict
@@ -47,6 +55,8 @@
 int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
 int32_t n_parallel = 1; // number of parallel sequences to decode
 int32_t n_sequences = 1; // number of sequences to decode
+float p_accept = 0.5f; // speculative decoding accept probability
+float p_split = 0.1f; // speculative decoding split probability
 int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
 int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
 int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
@@ -54,6 +64,13 @@
 int32_t n_beams = 0; // if non-zero then use beam search of given width.
 float rope_freq_base = 0.0f; // RoPE base frequency
 float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+float yarn_beta_fast = 32.0f; // YaRN low correction dim
+float yarn_beta_slow = 1.0f; // YaRN high correction dim
+int32_t yarn_orig_ctx = 0; // YaRN original context length
+int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+// pinging @cebtenzzre
 // // sampling parameters
 struct llama_sampling_params sparams;
@@ -110,6 +127,8 @@
 std::string image = ""; // path to an image file
 };
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
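The new rope/YaRN fields mirror the llama_context_params fields that llama_context_params_from_gpt_params() copies over in the common.cpp hunk above. A small sketch of driving them from code instead of the command line (illustrative; the numeric values are arbitrary examples, not defaults introduced by this commit):

    #include "common.h"
    #include "llama.h"

    static llama_context_params yarn_ctx_params_example() {
        gpt_params params;
        params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; // same value that --rope-scaling yarn selects
        params.yarn_orig_ctx     = 4096;                    // example: the model's original training context
        params.yarn_attn_factor  = 1.0f;                    // defaults shown in the struct above
        params.yarn_beta_fast    = 32.0f;
        params.yarn_beta_slow    = 1.0f;
        return llama_context_params_from_gpt_params(params);
    }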

common/log.h

@@ -97,6 +97,15 @@
 #define LOG_TEE_TARGET stderr
 #endif
+// Utility for synchronizing log configuration state
+// since std::optional was introduced only in c++17
+enum LogTriState
+{
+LogTriStateSame,
+LogTriStateFalse,
+LogTriStateTrue
+};
 // Utility to obtain "pid" like unique process id and use it when creating log files.
 inline std::string log_get_pid()
 {
@@ -118,16 +127,26 @@ inline std::string log_get_pid()
 // invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
 // where the number is a runtime id of the current thread.
-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
 // INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
+inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
 {
+static bool _multilog = false;
+if (multilog != LogTriStateSame)
+{
+_multilog = multilog == LogTriStateTrue;
+}
 std::stringstream buf;
 buf << log_file_basename;
+if (_multilog)
+{
 buf << ".";
 buf << log_get_pid();
+}
 buf << ".";
 buf << log_file_extension;
@@ -212,15 +231,6 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 #define LOG_TEE_FLF_VAL ,""
 #endif
-// Utility for synchronizing log configuration state
-// since std::optional was introduced only in c++17
-enum LogTriState
-{
-LogTriStateSame,
-LogTriStateFalse,
-LogTriStateTrue
-};
 // INTERNAL, DO NOT USE
 // USE LOG() INSTEAD
 //
@ -314,16 +324,23 @@ enum LogTriState
#endif #endif
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{ {
static bool _initialized{false}; static bool _initialized = false;
static bool _disabled{(filename.empty() && target == nullptr)}; static bool _append = false;
static bool _disabled = filename.empty() && target == nullptr;
static std::string log_current_filename{filename}; static std::string log_current_filename{filename};
static FILE *log_current_target{target}; static FILE *log_current_target{target};
static FILE *logfile = nullptr; static FILE *logfile = nullptr;
if (change) if (change)
{ {
if (append != LogTriStateSame)
{
_append = append == LogTriStateTrue;
return logfile;
}
if (disable == LogTriStateTrue) if (disable == LogTriStateTrue)
{ {
// Disable primary target // Disable primary target
@ -376,7 +393,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
} }
} }
logfile = fopen(filename.c_str(), "w"); logfile = fopen(filename.c_str(), _append ? "a" : "w");
} }
if (!logfile) if (!logfile)
@ -397,9 +414,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
} }
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{ {
return log_handler1_impl(change, disable, filename, target); return log_handler1_impl(change, append, disable, filename, target);
} }
// Disables logs entirely at runtime. // Disables logs entirely at runtime.
@ -410,7 +427,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTri
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_disable_impl() inline FILE *log_disable_impl()
{ {
return log_handler1_impl(true, LogTriStateTrue); return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
} }
// Enables logs at runtime. // Enables logs at runtime.
@ -419,19 +436,31 @@ inline FILE *log_disable_impl()
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_enable_impl() inline FILE *log_enable_impl()
{ {
return log_handler1_impl(true, LogTriStateFalse); return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
} }
// Sets the target for logs, either by a file name or a FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target) #define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); } inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); } inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_handler() { return log_handler1_impl(); } inline FILE *log_handler() { return log_handler1_impl(); }
// Enable or disable creating separate log files for each run.
// can ONLY be invoked BEFORE first log use.
#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
// Enable or disable append mode for log file.
// can ONLY be invoked BEFORE first log use.
#define log_append(enable) log_append_impl(enable)
// INTERNAL, DO NOT USE
inline FILE *log_append_impl(bool enable)
{
return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
}
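// Minimal usage sketch for the new switches: both log_multilog() and
// log_append() must run BEFORE the first log call, because the handler
// caches its state on first use. This assumes the header is included as
// "log.h" and that LOG() is the logging macro provided by this header.
//
//     #include "log.h"
//
//     int main() {
//         log_multilog(true);  // write "<name>.<pid>.log" instead of "<name>.log"
//         log_append(true);    // open the log file in append ("a") mode rather than truncating
//         log_set_target(log_filename_generator("llama", "log"));
//
//         LOG("logging initialized\n");
//         return 0;
//     }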
inline void log_test() inline void log_test()
{ {
log_disable(); log_disable();
@ -493,6 +522,18 @@ inline bool log_param_single_parse(const std::string & param)
return true; return true;
} }
if (param == "--log-new")
{
log_multilog(true);
return true;
}
if (param == "--log-append")
{
log_append(true);
return true;
}
return false; return false;
} }
@ -522,7 +563,9 @@ inline void log_print_usage()
printf(" --log-disable Disable trace logs\n"); printf(" --log-disable Disable trace logs\n");
printf(" --log-enable Enable trace logs\n"); printf(" --log-enable Enable trace logs\n");
printf(" --log-file Specify a log filename (without extension)\n"); printf(" --log-file Specify a log filename (without extension)\n");
printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */ printf(" --log-new Create a separate new log file on start. "
"Each log file will have unique name: \"<name>.<ID>.log\"\n");
printf(" --log-append Don't truncate the old log file.\n");
} }
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)


@ -39,6 +39,7 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
void llama_sampling_reset(llama_sampling_context * ctx) { void llama_sampling_reset(llama_sampling_context * ctx) {
if (ctx->grammar != NULL) { if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar); llama_grammar_free(ctx->grammar);
ctx->grammar = NULL;
} }
if (!ctx->parsed_grammar.rules.empty()) { if (!ctx->parsed_grammar.rules.empty()) {
@ -89,10 +90,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
snprintf(result, sizeof(result), snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n" "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau); params.mirostat, params.mirostat_eta, params.mirostat_tau);
return std::string(result); return std::string(result);
@ -110,6 +111,7 @@ llama_token llama_sampling_sample(
const float temp = params.temp; const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p; const float top_p = params.top_p;
const float min_p = params.min_p;
const float tfs_z = params.tfs_z; const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p; const float typical_p = params.typical_p;
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
@ -167,8 +169,12 @@ llama_token llama_sampling_sample(
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
} }
if (temp <= 0) { if (temp < 0.0) {
// greedy sampling // greedy sampling, with probs
llama_sample_softmax(ctx_main, &cur_p);
id = cur_p.data[0].id;
} else if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx_main, &cur_p); id = llama_sample_token_greedy(ctx_main, &cur_p);
} else { } else {
if (mirostat == 1) { if (mirostat == 1) {
@ -186,6 +192,7 @@ llama_token llama_sampling_sample(
llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep);
llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep);
llama_sample_temp (ctx_main, &cur_p, temp); llama_sample_temp (ctx_main, &cur_p, temp);
id = llama_sample_token(ctx_main, &cur_p); id = llama_sample_token(ctx_main, &cur_p);


@ -14,6 +14,7 @@ typedef struct llama_sampling_params {
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t top_k = 40; // <= 0 to use vocab size int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled float temp = 0.80f; // 1.0 = disabled


@ -1045,6 +1045,7 @@ struct train_params_common get_default_train_params_common() {
params.n_batch = 8; params.n_batch = 8;
params.n_gradient_accumulation = 1; params.n_gradient_accumulation = 1;
params.n_epochs = -1; params.n_epochs = -1;
params.n_gpu_layers = 0;
params.custom_n_ctx = false; params.custom_n_ctx = false;
@ -1080,6 +1081,7 @@ struct train_params_common get_default_train_params_common() {
params.adam_beta2 = 0.999f; params.adam_beta2 = 0.999f;
params.adam_gclip = 1.0f; params.adam_gclip = 1.0f;
params.adam_eps_f = 0.0f; params.adam_eps_f = 0.0f;
return params; return params;
} }


@ -44,6 +44,7 @@ struct train_params_common {
int n_batch; int n_batch;
int n_gradient_accumulation; int n_gradient_accumulation;
int n_epochs; int n_epochs;
int n_gpu_layers;
bool custom_n_ctx; bool custom_n_ctx;


@ -163,7 +163,8 @@ gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]: if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear": if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
# TOKENIZATION # TOKENIZATION


@ -151,8 +151,11 @@ class Params:
n_head_kv: int n_head_kv: int
f_norm_eps: float f_norm_eps: float
rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None f_rope_freq_base: float | None = None
f_rope_scale: float | None = None f_rope_scale: float | None = None
n_orig_ctx: int | None = None
rope_finetuned: bool | None = None
ftype: GGMLFileType | None = None ftype: GGMLFileType | None = None
@ -198,20 +201,20 @@ class Params:
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
n_vocab = config["vocab_size"] rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
n_embd = config["hidden_size"]
n_layer = config["num_hidden_layers"]
n_ff = config["intermediate_size"]
n_head = config["num_attention_heads"]
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
f_norm_eps = config["rms_norm_eps"]
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
rope_scaling = config.get("rope_scaling") rope_scaling = config.get("rope_scaling")
if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
f_rope_scale = config["rope_scaling"].get("factor") if rope_scaling is not None and (typ := rope_scaling.get("type")):
rope_factor = rope_scaling.get("factor")
f_rope_scale = rope_factor
if typ == "linear":
rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling['finetuned']
else: else:
f_rope_scale = None raise NotImplementedError(f'Unknown rope scaling type: {typ}')
if "max_sequence_length" in config: if "max_sequence_length" in config:
n_ctx = config["max_sequence_length"] n_ctx = config["max_sequence_length"]
@ -222,16 +225,19 @@ class Params:
"Suggestion: provide 'config.json' of the model in the same directory containing model files.") "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
return Params( return Params(
n_vocab = n_vocab, n_vocab = config["vocab_size"],
n_embd = n_embd, n_embd = config["hidden_size"],
n_layer = n_layer, n_layer = config["num_hidden_layers"],
n_ctx = n_ctx, n_ctx = n_ctx,
n_ff = n_ff, n_ff = config["intermediate_size"],
n_head = n_head, n_head = (n_head := config["num_attention_heads"]),
n_head_kv = n_head_kv, n_head_kv = config.get("num_key_value_heads", n_head),
f_norm_eps = f_norm_eps, f_norm_eps = config["rms_norm_eps"],
f_rope_freq_base = f_rope_freq_base, f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale, f_rope_scale = f_rope_scale,
n_orig_ctx = n_orig_ctx,
rope_finetuned = rope_finetuned,
) )
# LLaMA v2 70B params.json # LLaMA v2 70B params.json
@ -240,17 +246,8 @@ class Params:
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
n_vocab = config["vocab_size"] if "vocab_size" in config else -1
n_embd = config["dim"]
n_layer = config["n_layers"]
n_ff = -1
n_head = config["n_heads"]
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
f_norm_eps = config["norm_eps"]
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
# hack to determine LLaMA v1 vs v2 vs CodeLlama # hack to determine LLaMA v1 vs v2 vs CodeLlama
if f_rope_freq_base == 1000000: if config.get("rope_theta") == 1000000:
# CodeLlama # CodeLlama
n_ctx = 16384 n_ctx = 16384
elif config["norm_eps"] == 1e-05: elif config["norm_eps"] == 1e-05:
@ -260,22 +257,16 @@ class Params:
# LLaMA v1 # LLaMA v1
n_ctx = 2048 n_ctx = 2048
if n_vocab == -1:
n_vocab = model["tok_embeddings.weight"].shape[0]
if n_ff == -1:
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
return Params( return Params(
n_vocab = n_vocab, n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
n_embd = n_embd, n_embd = config["dim"],
n_layer = n_layer, n_layer = config["n_layers"],
n_ctx = n_ctx, n_ctx = n_ctx,
n_ff = n_ff, n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
n_head = n_head, n_head = (n_head := config["n_heads"]),
n_head_kv = n_head_kv, n_head_kv = config.get("n_kv_heads", n_head),
f_norm_eps = f_norm_eps, f_norm_eps = config["norm_eps"],
f_rope_freq_base = f_rope_freq_base, f_rope_freq_base = config.get("rope_theta"),
) )
@staticmethod @staticmethod
@ -366,15 +357,18 @@ class SentencePieceVocab:
added_tokens = {} added_tokens = {}
vocab_size: int = self.sentencepiece_tokenizer.vocab_size() vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
self.added_tokens_list = [text for (text, idx) in items] expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
self.vocab_size_base: int = vocab_size actual_new_ids = sorted(new_tokens.keys())
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
if expected_new_ids != actual_new_ids:
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
# Token pieces that were added to the base vocabulary.
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens self.fname_added_tokens = fname_added_tokens
@ -828,8 +822,16 @@ class OutputFile:
if params.f_rope_freq_base is not None: if params.f_rope_freq_base is not None:
self.gguf.add_rope_freq_base(params.f_rope_freq_base) self.gguf.add_rope_freq_base(params.f_rope_freq_base)
if params.f_rope_scale is not None: if params.rope_scaling_type:
self.gguf.add_rope_scale_linear(params.f_rope_scale) assert params.f_rope_scale is not None
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
self.gguf.add_rope_scaling_factor(params.f_rope_scale)
if params.n_orig_ctx is not None:
self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
if params.rope_finetuned is not None:
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
if params.ftype is not None: if params.ftype is not None:
self.gguf.add_file_type(params.ftype) self.gguf.add_file_type(params.ftype)


@ -154,6 +154,10 @@ int main(int argc, char ** argv) {
} }
} }
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
@ -181,7 +185,7 @@ int main(int argc, char ** argv) {
const auto t_pp_start = ggml_time_us(); const auto t_pp_start = ggml_time_us();
llama_kv_cache_tokens_rm(ctx, -1, -1); llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_TEE("%s: llama_decode() failed\n", __func__);


@ -11,7 +11,7 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (argc == 1 || argv[1][0] == '-') { if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]); printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
return 1 ; return 1 ;
} }
@ -21,6 +21,9 @@ int main(int argc, char ** argv) {
// total length of the sequences including the prompt // total length of the sequences including the prompt
int n_len = 32; int n_len = 32;
// number of layers to offload to the GPU
int n_gpu_layers = 0;
if (argc >= 2) { if (argc >= 2) {
params.model = argv[1]; params.model = argv[1];
} }
@ -37,6 +40,10 @@ int main(int argc, char ** argv) {
n_len = std::atoi(argv[4]); n_len = std::atoi(argv[4]);
} }
if (argc >= 6) {
n_gpu_layers = std::atoi(argv[5]);
}
if (params.prompt.empty()) { if (params.prompt.empty()) {
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
} }
@ -49,7 +56,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();
// model_params.n_gpu_layers = 99; // offload all layers to the GPU model_params.n_gpu_layers = n_gpu_layers;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);


@ -1,9 +1,6 @@
set(TARGET benchmark) set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp) add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common) target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "ggml.h" #include "ggml.h"


@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"


@ -642,8 +642,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom(ctx, return ggml_rope_custom(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
rope_freq_base, rope_freq_scale); rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f
);
}; };
set_name(tokens_input, "tokens_input"); set_name(tokens_input, "tokens_input");
@ -652,7 +653,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
if (ggml_is_quantized(a->type)) { if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
} else if (a->type == GGML_TYPE_F32) { } else if (a->type == GGML_TYPE_F32) {
return ggml_add(ctx, a, b); return ggml_add(ctx, a, b);
@ -1459,6 +1460,17 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
} }
params->n_rank_w3 = std::stoi(argv[i]); params->n_rank_w3 = std::stoi(argv[i]);
params->custom_n_rank_w3 = true; params->custom_n_rank_w3 = true;
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params->common.n_gpu_layers = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else { } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
train_print_usage(argc, argv, &default_params); train_print_usage(argc, argv, &default_params);
@ -1545,6 +1557,7 @@ int main(int argc, char ** argv) {
srand(params.common.seed); srand(params.common.seed);
struct llama_model_params llama_mparams = llama_model_default_params(); struct llama_model_params llama_mparams = llama_model_default_params();
llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
llama_mparams.vocab_only = false; llama_mparams.vocab_only = false;
printf("%s: model base = '%s'\n", __func__, params.fn_model_base); printf("%s: model base = '%s'\n", __func__, params.fn_model_base);


@ -0,0 +1,34 @@
#!/bin/bash
cd `dirname $0`
cd ../..
EXE="./finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
while getopts "dg" opt; do
case $opt in
d)
DEBUGGER="gdb --args"
;;
g)
EXE="./build/bin/Release/finetune"
GPUARG="--gpu-layers 25"
;;
esac
done
$DEBUGGER $EXE \
--model-base $MODEL \
$GPUARG \
--checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
--checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
--lora-out lora-ol3b-shakespeare-ITERATION.bin \
--train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
--save-every 10 \
--threads 10 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing


@ -3,6 +3,3 @@ add_executable(${TARGET} infill.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -2,7 +2,6 @@
#include "console.h" #include "console.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include "grammar-parser.h" #include "grammar-parser.h"
#include <cassert> #include <cassert>
@ -184,8 +183,8 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
} }
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL); params.seed = time(NULL);


@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -19,7 +19,6 @@
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
#include "common.h" #include "common.h"
#include "build-info.h"
#include "ggml-cuda.h" #include "ggml-cuda.h"
// utils // utils
@ -641,8 +640,8 @@ struct test {
} }
}; };
const std::string test::build_commit = BUILD_COMMIT; const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = BUILD_NUMBER; const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas(); const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::metal = !!ggml_cpu_has_metal(); const bool test::metal = !!ggml_cpu_has_metal();
@ -1037,7 +1036,7 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx); test t(inst, lmodel, ctx);
llama_kv_cache_tokens_rm(ctx, -1, -1); llama_kv_cache_clear(ctx);
// warmup run // warmup run
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
@ -1048,7 +1047,7 @@ int main(int argc, char ** argv) {
} }
for (int i = 0; i < params.reps; i++) { for (int i = 0; i < params.reps; i++) {
llama_kv_cache_tokens_rm(ctx, -1, -1); llama_kv_cache_clear(ctx);
uint64_t t_start = get_time_ns(); uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) { if (t.n_prompt > 0) {


@ -5,9 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC) if (NOT MSVC)
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
endif()
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif() endif()
set(TARGET llava) set(TARGET llava)
@ -15,6 +12,3 @@ add_executable(${TARGET} llava.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -16,6 +16,8 @@ add_library(common OBJECT
${_common_path}/console.cpp ${_common_path}/console.cpp
${_common_path}/grammar-parser.h ${_common_path}/grammar-parser.h
${_common_path}/grammar-parser.cpp ${_common_path}/grammar-parser.cpp
${_common_path}/sampling.h
${_common_path}/sampling.cpp
) )
# WARNING: because build-info.h is auto-generated, it will only # WARNING: because build-info.h is auto-generated, it will only


@ -3,6 +3,3 @@ add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -208,6 +208,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
Example usage: `--top-p 0.95`
### Min-P Sampling
- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, tokens with a probability below 0.045 are filtered out.
Example usage: `--min-p 0.05`
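To make the cutoff concrete, here is a small self-contained sketch of the filtering rule (illustrative only, not the actual `llama_sample_min_p` implementation; candidate token ids and normalized probabilities are assumed to be computed already):

#include <algorithm>
#include <utility>
#include <vector>

// keep only candidates whose probability is at least min_p times the
// probability of the single most likely candidate
static std::vector<std::pair<int, float>> min_p_filter(
        const std::vector<std::pair<int, float>> & candidates, float min_p) {
    if (candidates.empty() || min_p <= 0.0f) {
        return candidates;
    }
    float max_prob = 0.0f;
    for (const auto & c : candidates) {
        max_prob = std::max(max_prob, c.second);
    }
    const float threshold = min_p * max_prob; // e.g. 0.05 * 0.9 = 0.045
    std::vector<std::pair<int, float>> kept;
    for (const auto & c : candidates) {
        if (c.second >= threshold) {
            kept.push_back(c); // (token id, probability)
        }
    }
    return kept;
}

With `--min-p 0.05` this reproduces the example above: only tokens with probability >= 0.045 survive the filter.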
### Tail Free Sampling (TFS)
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).


@ -2,7 +2,6 @@
#include "console.h" #include "console.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include <cassert> #include <cassert>
#include <cinttypes> #include <cinttypes>
@ -153,8 +152,8 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
} }
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL); params.seed = time(NULL);
@ -298,7 +297,7 @@ int main(int argc, char ** argv) {
} }
// remove any "future" tokens that we might have inherited from the previous session // remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1); llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
} }
LOGLN( LOGLN(


@ -3,6 +3,3 @@ add_executable(${TARGET} parallel.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -1,8 +1,6 @@
// A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel.
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"


@ -3,6 +3,3 @@ add_executable(${TARGET} perplexity.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
@ -210,7 +209,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const auto t_start = std::chrono::high_resolution_clock::now(); const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache // clear the KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1); llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) { for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch; const int batch_start = start + j * n_batch;
@ -339,7 +338,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const auto t_start = std::chrono::high_resolution_clock::now(); const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache // clear the KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1); llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) { for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch; const int batch_start = start + j * n_batch;
@ -573,7 +572,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
} }
// clear the KV cache // clear the KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1); llama_kv_cache_clear(ctx);
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab); auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
if (logits.empty()) { if (logits.empty()) {


@ -1,6 +1,6 @@
set(TARGET quantize-stats) set(TARGET quantize-stats)
add_executable(${TARGET} quantize-stats.cpp) add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common) target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)


@ -1,5 +1,4 @@
#define LLAMA_API_INTERNAL #define LLAMA_API_INTERNAL
#include "build-info.h"
#include "common.h" #include "common.h"
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"


@ -1,9 +1,6 @@
set(TARGET quantize) set(TARGET quantize)
add_executable(${TARGET} quantize.cpp) add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common) target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
@ -18,7 +17,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
#ifdef GGML_USE_K_QUANTS
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
@ -31,7 +29,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
#endif
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
@ -70,13 +67,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
} }
// usage: // usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// //
[[noreturn]] [[noreturn]]
static void usage(const char * executable) { static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf("\nAllowed quantization types:\n"); printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) { for (auto & it : QUANT_OPTIONS) {
if (it.name != "COPY") { if (it.name != "COPY") {
@ -103,6 +101,8 @@ int main(int argc, char ** argv) {
params.quantize_output_tensor = false; params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true; params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
params.pure = true;
} else { } else {
usage(argv[0]); usage(argv[0]);
} }


@ -3,6 +3,3 @@ add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"


@ -11,6 +11,3 @@ if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif() endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -1,6 +1,5 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include "grammar-parser.h" #include "grammar-parser.h"
#include "../llava/clip.h" #include "../llava/clip.h"
@ -149,6 +148,7 @@ struct task_server {
task_type type; task_type type;
json data; json data;
bool infill_mode = false; bool infill_mode = false;
bool embedding_mode = false;
}; };
struct task_result { struct task_result {
@ -371,6 +371,7 @@ struct llama_client_slot
std::vector<completion_token_output> generated_token_probs; std::vector<completion_token_output> generated_token_probs;
bool infill = false; bool infill = false;
bool embedding = false;
bool has_next_token = true; bool has_next_token = true;
bool truncated = false; bool truncated = false;
bool stopped_eos = false; bool stopped_eos = false;
@ -454,7 +455,7 @@ struct llama_client_slot
} }
void release() { void release() {
if (state == PROCESSING) if (state == IDLE || state == PROCESSING)
{ {
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
command = RELEASE; command = RELEASE;
@ -754,6 +755,7 @@ struct llama_server_context
} }
slot->params.antiprompt.clear(); slot->params.antiprompt.clear();
const auto &stop = data.find("stop"); const auto &stop = data.find("stop");
if (stop != data.end() && stop->is_array()) if (stop != data.end() && stop->is_array())
{ {
@ -856,7 +858,7 @@ struct llama_server_context
void kv_cache_clear() { void kv_cache_clear() {
// clear the entire KV cache // clear the entire KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1); llama_kv_cache_clear(ctx);
clean_kv_cache = false; clean_kv_cache = false;
} }
@ -867,7 +869,7 @@ struct llama_server_context
kv_cache_clear(); kv_cache_clear();
for (int32_t i = 0; i < batch.n_tokens; ++i) for (int i = 0; i < (int) system_tokens.size(); ++i)
{ {
llama_batch_add(batch, system_tokens[i], i, { 0 }, false); llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
} }
@ -894,16 +896,8 @@ struct llama_server_context
{ {
slot.release(); slot.release();
} }
wait_all_are_idle();
all_slots_are_idle = true;
// wait until system prompt load
system_need_update = true; system_need_update = true;
while (system_need_update)
{
std::this_thread::sleep_for(std::chrono::milliseconds(5));
}
// system prompt loaded, continue
} }
void process_system_prompt_data(const json &sys_props) { void process_system_prompt_data(const json &sys_props) {
@ -915,26 +909,6 @@ struct llama_server_context
{ {
notify_system_prompt_changed(); notify_system_prompt_changed();
} }
else
{
system_need_update = true;
}
}
void wait_all_are_idle() {
bool wait = true;
while (wait)
{
wait = false;
for (auto &slot : slots)
{
if (!slot.available())
{
wait = true;
break;
}
}
}
} }
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@ -965,7 +939,6 @@ struct llama_server_context
slot.has_next_token = false; slot.has_next_token = false;
} }
stop_pos = pos; stop_pos = pos;
} }
} }
@ -1272,13 +1245,14 @@ struct llama_server_context
queue_results.push_back(res); queue_results.push_back(res);
} }
int request_completion(json data, bool infill) int request_completion(json data, bool infill, bool embedding)
{ {
std::lock_guard<std::mutex> lock(mutex_tasks); std::lock_guard<std::mutex> lock(mutex_tasks);
task_server task; task_server task;
task.id = id_gen++; task.id = id_gen++;
task.data = data; task.data = data;
task.infill_mode = infill; task.infill_mode = infill;
task.embedding_mode = embedding;
task.type = COMPLETION_TASK; task.type = COMPLETION_TASK;
queue_tasks.push_back(task); queue_tasks.push_back(task);
return task.id; return task.id;
@ -1404,7 +1378,7 @@ struct llama_server_context
{ {
LOG_TEE("slot unavailable\n"); LOG_TEE("slot unavailable\n");
// send error result // send error result
send_error(task.id, "slot unavaliable"); send_error(task.id, "slot unavailable");
return; return;
} }
@ -1416,6 +1390,7 @@ struct llama_server_context
slot->reset(); slot->reset();
slot->infill = task.infill_mode; slot->infill = task.infill_mode;
slot->embedding = task.embedding_mode;
slot->task_id = task.id; slot->task_id = task.id;
if (!launch_slot_with_data(slot, task.data)) if (!launch_slot_with_data(slot, task.data))
@ -1444,7 +1419,7 @@ struct llama_server_context
process_tasks(); process_tasks();
// update the system prompt wait until all slots are idle state // update the system prompt wait until all slots are idle state
if (system_need_update) if (system_need_update && all_slots_are_idle)
{ {
LOG_TEE("updating system prompt\n"); LOG_TEE("updating system prompt\n");
update_system_prompt(); update_system_prompt();
@ -1498,7 +1473,7 @@ struct llama_server_context
for (auto & slot : slots) for (auto & slot : slots)
{ {
// release the slot // release the slot
if (slot.state == PROCESSING && slot.command == RELEASE) if (slot.command == RELEASE)
{ {
slot.state = IDLE; slot.state = IDLE;
slot.command = NONE; slot.command = NONE;
@ -1509,7 +1484,7 @@ struct llama_server_context
continue; continue;
} }
if (slot.state == IDLE || slot.command == RELEASE) if (slot.state == IDLE)
{ {
continue; continue;
} }
@ -1530,6 +1505,17 @@ struct llama_server_context
{ {
for (auto & slot : slots) for (auto & slot : slots)
{ {
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
// empty prompt passed -> release the slot and send empty response
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
{
slot.release();
slot.print_timings();
send_final_response(slot);
continue;
}
// need process the prompt // need process the prompt
if (slot.state == IDLE && slot.command == LOAD_PROMPT) if (slot.state == IDLE && slot.command == LOAD_PROMPT)
{ {
@ -1712,7 +1698,7 @@ struct llama_server_context
} }
// prompt evaluated for embedding // prompt evaluated for embedding
if (params.embedding) if (slot.embedding)
{ {
send_embedding(slot); send_embedding(slot);
slot.release(); slot.release();
@ -1749,8 +1735,8 @@ struct llama_server_context
if (!process_token(result, slot)) if (!process_token(result, slot))
{ {
slot.release(); slot.release();
send_final_response(slot);
slot.print_timings(); slot.print_timings();
send_final_response(slot);
} }
slot.i_batch = -1; slot.i_batch = -1;
@ -1769,9 +1755,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -h, --help show this help message and exit\n"); printf(" -h, --help show this help message and exit\n");
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-scaling {none,linear,yarn}\n");
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
@ -1893,6 +1886,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
params.n_ctx = std::stoi(argv[i]); params.n_ctx = std::stoi(argv[i]);
} }
else if (arg == "--rope-scaling")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
std::string value(argv[i]);
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
else { invalid_param = true; break; }
}
else if (arg == "--rope-freq-base") else if (arg == "--rope-freq-base")
{ {
if (++i >= argc) if (++i >= argc)
@ -1911,6 +1917,38 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
params.rope_freq_scale = std::stof(argv[i]); params.rope_freq_scale = std::stof(argv[i]);
} }
else if (arg == "--yarn-ext-factor")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_ext_factor = std::stof(argv[i]);
}
else if (arg == "--yarn-attn-factor")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_attn_factor = std::stof(argv[i]);
}
else if (arg == "--yarn-beta-fast")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_fast = std::stof(argv[i]);
}
else if (arg == "--yarn-beta-slow")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_slow = std::stof(argv[i]);
}
else if (arg == "--memory-f32" || arg == "--memory_f32") else if (arg == "--memory-f32" || arg == "--memory_f32")
{ {
params.memory_f16 = false; params.memory_f16 = false;
@ -1924,6 +1962,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
params.n_threads = std::stoi(argv[i]); params.n_threads = std::stoi(argv[i]);
} }
else if (arg == "--threads-batch" || arg == "-tb")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_threads_batch = std::stoi(argv[i]);
}
else if (arg == "-b" || arg == "--batch-size") else if (arg == "-b" || arg == "--batch-size")
{ {
if (++i >= argc) if (++i >= argc)
@ -2216,8 +2263,8 @@ int main(int argc, char **argv)
llama_backend_init(params.numa); llama_backend_init(params.numa);
LOG_INFO("build info", {{"build", BUILD_NUMBER}, LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
{"commit", BUILD_COMMIT}}); {"commit", LLAMA_COMMIT}});
LOG_INFO("system info", { LOG_INFO("system info", {
{"n_threads", params.n_threads}, {"n_threads", params.n_threads},
@ -2281,11 +2328,11 @@ int main(int argc, char **argv)
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
{ {
json data = json::parse(req.body); json data = json::parse(req.body);
const int task_id = llama.request_completion(data, false); const int task_id = llama.request_completion(data, false, false);
if (!json_value(data, "stream", false)) { if (!json_value(data, "stream", false)) {
std::string completion_text; std::string completion_text;
task_result result = llama.next_result(task_id); task_result result = llama.next_result(task_id);
if(!result.error && result.stop) { if (!result.error && result.stop) {
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
} }
else else
@ -2312,7 +2359,7 @@ int main(int argc, char **argv)
{ {
return false; return false;
} }
if(result.stop) { if (result.stop) {
break; break;
} }
} else { } else {
@ -2336,7 +2383,7 @@ int main(int argc, char **argv)
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
{ {
json data = json::parse(req.body); json data = json::parse(req.body);
const int task_id = llama.request_completion(data, true); const int task_id = llama.request_completion(data, true, false);
if (!json_value(data, "stream", false)) { if (!json_value(data, "stream", false)) {
std::string completion_text; std::string completion_text;
task_result result = llama.next_result(task_id); task_result result = llama.next_result(task_id);
@ -2440,7 +2487,7 @@ int main(int argc, char **argv)
{ {
prompt = ""; prompt = "";
} }
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false); const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
task_result result = llama.next_result(task_id); task_result result = llama.next_result(task_id);
return res.set_content(result.result_json.dump(), "application/json"); return res.set_content(result.result_json.dump(), "application/json");
}); });
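The three call sites above suggest that request_completion() now takes two extra booleans distinguishing infill and embedding-only requests, i.e. (data, infill, embedding). For reference, a minimal client-side sketch of a non-streaming /completion request; the port and field values are illustrative assumptions, and the sketch reuses the same httplib and nlohmann::json headers the server itself builds against:

#include "httplib.h"
#include "json.hpp"
#include <cstdio>

int main() {
    httplib::Client cli("localhost", 8080);   // assumes the default server port

    nlohmann::json body = {
        {"prompt",    "Building a website can be done in 10 simple steps:"},
        {"n_predict", 32},
        {"stream",    false},
    };

    auto res = cli.Post("/completion", body.dump(), "application/json");
    if (res && res->status == 200) {
        printf("%s\n", res->body.c_str());
    }
    return 0;
}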


@ -95,13 +95,8 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(512, 0, 1); llama_batch batch = llama_batch_init(512, 0, 1);
// evaluate the initial prompt // evaluate the initial prompt
batch.n_tokens = tokens_list.size(); for (size_t i = 0; i < tokens_list.size(); i++) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
for (int32_t i = 0; i < batch.n_tokens; i++) {
batch.token[i] = tokens_list[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
} }
// llama_decode will output logits only for the last token of the prompt // llama_decode will output logits only for the last token of the prompt
@ -148,15 +143,10 @@ int main(int argc, char ** argv) {
fflush(stdout); fflush(stdout);
// prepare the next batch // prepare the next batch
batch.n_tokens = 0; llama_batch_clear(batch);
// push this new token for next evaluation // push this new token for next evaluation
batch.token [batch.n_tokens] = new_token_id; llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
batch.pos [batch.n_tokens] = n_cur;
batch.seq_id[batch.n_tokens] = 0;
batch.logits[batch.n_tokens] = true;
batch.n_tokens += 1;
n_decode += 1; n_decode += 1;
} }
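The manual batch bookkeeping removed above is replaced by the llama_batch_clear/llama_batch_add helpers from common. A rough sketch of what they do, inferred from the field writes they replace (the real implementations live in common and also cover multi-sequence batches; treat this as a reading aid, not the actual code):

#include "llama.h"
#include <vector>

static void llama_batch_clear_sketch(llama_batch & batch) {
    batch.n_tokens = 0;                       // reuse the same buffers for the next batch
}

static void llama_batch_add_sketch(llama_batch & batch, llama_token id, llama_pos pos,
                                   const std::vector<llama_seq_id> & seq_ids, bool logits) {
    batch.token   [batch.n_tokens] = id;
    batch.pos     [batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = (int32_t) seq_ids.size();
    for (size_t i = 0; i < seq_ids.size(); ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits  [batch.n_tokens] = logits;  // request logits only where needed
    batch.n_tokens++;
}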


@ -3,6 +3,3 @@ add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -1,5 +1,3 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
@ -8,6 +6,9 @@
#include <string> #include <string>
#include <vector> #include <vector>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
struct seq_draft { struct seq_draft {
bool active = false; bool active = false;
bool drafting = false; bool drafting = false;
@ -36,9 +37,11 @@ int main(int argc, char ** argv) {
// max number of parallel drafting sequences (i.e. tree branches) // max number of parallel drafting sequences (i.e. tree branches)
const int n_seq_dft = params.n_parallel; const int n_seq_dft = params.n_parallel;
// TODO: make this configurable // probability threshold for accepting a token from the draft model
const float p_accept = 0.80f; const float p_accept = params.p_accept;
const float p_split = 0.10f;
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
const float p_split = params.p_split;
#ifndef LOG_DISABLE_LOGS #ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log")); log_set_target(log_filename_generator("speculative", "log"));
@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
params.n_gpu_layers = params.n_gpu_layers_draft; params.n_gpu_layers = params.n_gpu_layers_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
{
const int n_vocab_tgt = llama_n_vocab(model_tgt);
const int n_vocab_dft = llama_n_vocab(model_dft);
const int vocab_diff = n_vocab_tgt > n_vocab_dft
? n_vocab_tgt - n_vocab_dft
: n_vocab_dft - n_vocab_tgt;
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return 1;
}
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
llama_token_to_piece(ctx_tgt, i).c_str(),
llama_token_to_piece(ctx_dft, i).c_str());
return 1;
}
}
}
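The vocabulary check above could equally be expressed as a standalone predicate; the sketch below is a hypothetical refactoring that uses only the llama.h calls already present in this hunk (llama_n_vocab, llama_token_get_text) plus the SPEC_VOCAB_* constants defined at the top of the file, and assumes <cstring>, <cstdlib> and <algorithm> are included:

static bool speculation_vocabs_compatible(const llama_model * tgt, const llama_model * dft) {
    const int n_vocab_tgt = llama_n_vocab(tgt);
    const int n_vocab_dft = llama_n_vocab(dft);

    if (std::abs(n_vocab_tgt - n_vocab_dft) > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
        return false;   // vocab sizes too far apart for speculation
    }
    for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
        if (std::strcmp(llama_token_get_text(tgt, i), llama_token_get_text(dft, i)) != 0) {
            return false;   // token content differs between the two vocabs
        }
    }
    return true;
}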
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true); inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
@ -118,7 +148,7 @@ int main(int argc, char ** argv) {
std::vector<seq_draft> drafts(n_seq_dft); std::vector<seq_draft> drafts(n_seq_dft);
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
params.sparams.temp = std::max(0.01f, params.sparams.temp); params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
for (int s = 0; s < n_seq_dft; ++s) { for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams); drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode (ctx_dft, batch_dft); llama_decode (ctx_dft, batch_dft);
++n_past_dft; ++n_past_dft;
@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
} }
//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt)); // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
llama_decode(ctx_tgt, batch_tgt); llama_decode(ctx_tgt, batch_tgt);
++n_past_tgt; ++n_past_tgt;
} }

View file

@ -349,9 +349,9 @@ static struct ggml_tensor * llama_build_train_graphs(
// not capturing these, to silence warnings // not capturing these, to silence warnings
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom(ctx, return ggml_rope_custom(
t, KQ_pos, n_rot, rope_mode, n_ctx, ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
rope_freq_base, rope_freq_scale); );
}; };
set_name(tokens_input, "tokens_input"); set_name(tokens_input, "tokens_input");
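ggml_rope_custom() gains several arguments in this change; for readability, here is the call above with each positional argument labelled (names follow the declaration added to ggml.h later in this diff; the labels are annotations only, not part of the change):

ggml_rope_custom(
    ctx, t, KQ_pos,
    n_rot,            // n_dims
    rope_mode,        // mode
    n_ctx,            // n_ctx
    0,                // n_orig_ctx
    rope_freq_base,   // freq_base
    rope_freq_scale,  // freq_scale
    0.0f,             // ext_factor - 0 keeps the YaRN mixing disabled (see rope_yarn below)
    1.0f,             // attn_factor
    0.0f,             // beta_fast
    0.0f              // beta_slow
);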

flake.lock (generated, 12 lines changed)

@ -5,11 +5,11 @@
"systems": "systems" "systems": "systems"
}, },
"locked": { "locked": {
"lastModified": 1692799911, "lastModified": 1694529238,
"narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=", "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
"owner": "numtide", "owner": "numtide",
"repo": "flake-utils", "repo": "flake-utils",
"rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44", "rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1692913444, "lastModified": 1698318101,
"narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=", "narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "18324978d632ffc55ef1d928e81630c620f4f447", "rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
"type": "github" "type": "github"
}, },
"original": { "original": {


@ -11,8 +11,7 @@
meta.mainProgram = "llama"; meta.mainProgram = "llama";
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin; inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
buildInputs = with pkgs; [ openmpi ]; buildInputs = with pkgs; [ openmpi ];
osSpecific = with pkgs; buildInputs ++ osSpecific = with pkgs; buildInputs ++ (
(
if isAarch64 && isDarwin then if isAarch64 && isDarwin then
with pkgs.darwin.apple_sdk_11_0.frameworks; [ with pkgs.darwin.apple_sdk_11_0.frameworks; [
Accelerate Accelerate
@ -51,6 +50,9 @@
}; };
llama-python = llama-python =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
llama-python-extra =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
postPatch = '' postPatch = ''
substituteInPlace ./ggml-metal.m \ substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
@ -93,12 +95,15 @@
}; };
packages.rocm = pkgs.stdenv.mkDerivation { packages.rocm = pkgs.stdenv.mkDerivation {
inherit name src meta postPatch nativeBuildInputs postInstall; inherit name src meta postPatch nativeBuildInputs postInstall;
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ]; buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
cmakeFlags = cmakeFlags ++ [ cmakeFlags = cmakeFlags ++ [
"-DLLAMA_HIPBLAS=1" "-DLLAMA_HIPBLAS=1"
"-DCMAKE_C_COMPILER=hipcc" "-DCMAKE_C_COMPILER=hipcc"
"-DCMAKE_CXX_COMPILER=hipcc" "-DCMAKE_CXX_COMPILER=hipcc"
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON" # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
# in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
# and select the line that matches the current nixpkgs version of rocBLAS.
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
]; ];
}; };
apps.llama-server = { apps.llama-server = {
@ -126,5 +131,9 @@
buildInputs = [ llama-python ]; buildInputs = [ llama-python ];
packages = nativeBuildInputs ++ osSpecific; packages = nativeBuildInputs ++ osSpecific;
}; };
devShells.extra = pkgs.mkShell {
buildInputs = [ llama-python-extra ];
packages = nativeBuildInputs ++ osSpecific;
};
}); });
} }

File diff suppressed because it is too large

ggml-impl.h (new file, 237 lines)

@ -0,0 +1,237 @@
#pragma once
#include "ggml.h"
// GGML internal header
#include <assert.h>
#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h> // fabsf
#ifdef __cplusplus
extern "C" {
#endif
// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#ifndef __SSE3__
#define __SSE3__
#endif
#endif
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON) && !defined(_MSC_VER)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
#define GGML_FP16_TO_FP32(x) ((float) (x))
#define GGML_FP32_TO_FP16(x) (x)
#else
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#ifdef __F16C__
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
#endif // __ARM_NEON
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
// TODO: backend v2 PR
#ifdef __cplusplus
}
#endif
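ggml-impl.h is an internal header, but the conversion macros it declares are easy to exercise. A minimal round-trip sketch (illustrative only; it assumes ggml is built and linked, and calls ggml_init() first so that ggml_table_f32_f16 is populated on platforms that use the lookup path):

#include "ggml-impl.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = { 16*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);           // fills ggml_table_f32_f16

    const float       x = 0.1f;
    const ggml_fp16_t h = GGML_FP32_TO_FP16(x);              // round to half precision
    const float       y = GGML_FP16_TO_FP32(h);              // convert back (table lookup or hw)
    printf("%.8f -> %.8f\n", x, y);                          // expect a small rounding error

    ggml_free(ctx);
    return 0;
}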


@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
if (sourcePath == nil) {
GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
sourcePath = @"ggml-metal.metal";
}
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
if (error) { if (error) {
@ -234,12 +238,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
// load kernels // load kernels
{ {
NSError * error = nil; NSError * error = nil;
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ /*
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \ (int) ctx->pipeline_##name.threadExecutionWidth); \
*/
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
if (error) { \ if (error) { \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \ return NULL; \
@ -994,11 +1001,15 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
const int nth = MIN(32, ne00); int nth = 32; // SIMD width
if (ne00%4 == 0) { if (ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_soft_max_4]; [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
} else { } else {
do {
nth *= 2;
} while (nth <= ne00 && nth <= 1024);
nth /= 2;
[encoder setComputePipelineState:ctx->pipeline_soft_max]; [encoder setComputePipelineState:ctx->pipeline_soft_max];
} }
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@ -1006,8 +1017,9 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
{ {
@ -1336,7 +1348,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4]; [encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);
@ -1391,11 +1403,16 @@ void ggml_metal_graph_compute(
const int n_past = ((int32_t *) dst->op_params)[0]; const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
float freq_base; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
float freq_scale; memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
@ -1425,8 +1442,13 @@ void ggml_metal_graph_compute(
[encoder setBytes:&n_past length:sizeof( int) atIndex:19]; [encoder setBytes:&n_past length:sizeof( int) atIndex:19];
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; [encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
[encoder setBytes:&mode length:sizeof( int) atIndex:21]; [encoder setBytes:&mode length:sizeof( int) atIndex:21];
[encoder setBytes:&freq_base length:sizeof(float) atIndex:22]; [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22];
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:23]; [encoder setBytes:&freq_base length:sizeof( float) atIndex:23];
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:24];
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:25];
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:27];
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:28];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;


@ -184,36 +184,73 @@ kernel void kernel_soft_max(
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
constant int64_t & ne02, constant int64_t & ne02,
uint3 tgpig[[threadgroup_position_in_grid]], threadgroup float * buf [[threadgroup(0)]],
uint3 tpitg[[thread_position_in_threadgroup]], uint tgpig[[threadgroup_position_in_grid]],
uint3 ntg[[threads_per_threadgroup]]) { uint tpitg[[thread_position_in_threadgroup]],
const int64_t i03 = tgpig[2]; uint sgitg[[simdgroup_index_in_threadgroup]],
const int64_t i02 = tgpig[1]; uint tiisg[[thread_index_in_simdgroup]],
const int64_t i01 = tgpig[0]; uint ntg[[threads_per_threadgroup]]) {
const int64_t i03 = (tgpig) / (ne02*ne01);
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
// parallel max // parallel max
float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY; float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY;
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) {
lmax = MAX(lmax, psrc0[i00]); lmax = MAX(lmax, psrc0[i00]);
} }
const float max = simd_max(lmax);
float max = simd_max(lmax);
if (tiisg == 0) {
buf[sgitg] = max;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// broadcast, simd group number is ntg / 32
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) {
buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
max = buf[0];
// parallel sum // parallel sum
float lsum = 0.0f; float lsum = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
const float exp_psrc0 = exp(psrc0[i00] - max); const float exp_psrc0 = exp(psrc0[i00] - max);
lsum += exp_psrc0; lsum += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not // Remember the result of exp here. exp is expensive, so we really do not
// whish to compute it twice. // wish to compute it twice.
pdst[i00] = exp_psrc0; pdst[i00] = exp_psrc0;
} }
const float sum = simd_sum(lsum); float sum = simd_sum(lsum);
if (tiisg == 0) {
buf[sgitg] = sum;
}
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { threadgroup_barrier(mem_flags::mem_threadgroup);
// broadcast, simd group number is ntg / 32
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) {
buf[tpitg] += buf[tpitg + i];
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sum = buf[0];
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
pdst[i00] /= sum; pdst[i00] /= sum;
} }
} }
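The kernel above computes a numerically stable softmax per row: a simdgroup reduction produces per-SIMD-group partial max/sum values, which are then combined through the new threadgroup buffer. As a reading aid, the per-row math it implements, written as plain CPU code (illustrative, not part of the change):

#include <algorithm>
#include <cmath>

static void soft_max_row_reference(const float * src, float * dst, int ne00) {
    float maxv = -INFINITY;
    for (int i = 0; i < ne00; ++i) {
        maxv = std::max(maxv, src[i]);            // subtract the row max for stability
    }
    float sum = 0.0f;
    for (int i = 0; i < ne00; ++i) {
        dst[i] = std::exp(src[i] - maxv);         // remember exp(x - max), as the kernel does
        sum   += dst[i];
    }
    for (int i = 0; i < ne00; ++i) {
        dst[i] /= sum;
    }
}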
@ -224,37 +261,73 @@ kernel void kernel_soft_max_4(
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
constant int64_t & ne02, constant int64_t & ne02,
uint3 tgpig[[threadgroup_position_in_grid]], threadgroup float * buf [[threadgroup(0)]],
uint3 tpitg[[thread_position_in_threadgroup]], uint tgpig[[threadgroup_position_in_grid]],
uint3 ntg[[threads_per_threadgroup]]) { uint tpitg[[thread_position_in_threadgroup]],
const int64_t i03 = tgpig[2]; uint sgitg[[simdgroup_index_in_threadgroup]],
const int64_t i02 = tgpig[1]; uint tiisg[[thread_index_in_simdgroup]],
const int64_t i01 = tgpig[0]; uint ntg[[threads_per_threadgroup]]) {
const int64_t i03 = (tgpig) / (ne02*ne01);
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
// parallel max // parallel max
float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY; float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY;
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) {
lmax4 = fmax(lmax4, psrc4[i00]); lmax4 = fmax(lmax4, psrc4[i00]);
} }
float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
const float max = simd_max(lmax); const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
float max = simd_max(lmax);
if (tiisg == 0) {
buf[sgitg] = max;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// broadcast, simd group number is ntg / 32
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) {
buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
max = buf[0];
// parallel sum // parallel sum
float4 lsum4 = 0.0f; float4 lsum4 = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
const float4 exp_psrc4 = exp(psrc4[i00] - max); const float4 exp_psrc4 = exp(psrc4[i00] - max);
lsum4 += exp_psrc4; lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4; pdst4[i00] = exp_psrc4;
} }
float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
const float sum = simd_sum(lsum); const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
float sum = simd_sum(lsum);
if (tiisg == 0) {
buf[sgitg] = sum;
}
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { threadgroup_barrier(mem_flags::mem_threadgroup);
// broadcast, simd group number is ntg / 32
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) {
buf[tpitg] += buf[tpitg + i];
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sum = buf[0];
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
pdst4[i00] /= sum; pdst4[i00] /= sum;
} }
} }
@ -988,6 +1061,45 @@ kernel void kernel_alibi_f32(
} }
} }
static float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
}
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
static void rope_yarn(
float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
thread float * cos_theta, thread float * sin_theta
) {
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap;
float theta = theta_interp;
if (ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
}
*cos_theta = cos(theta) * mscale;
*sin_theta = sin(theta) * mscale;
}
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base));
}
static void rope_yarn_corr_dims(
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
) {
// start and end correction dims
dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
}
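rope_yarn_corr_dims() above has a CPU-side counterpart exported from ggml.h later in this diff (ggml_rope_yarn_corr_dims). A quick numeric illustration of that exported helper; the configuration values are arbitrary examples (beta_fast = 32 and beta_slow = 1 appear to be the usual llama.cpp defaults), not taken from any particular model:

#include "ggml.h"
#include <cstdio>

int main() {
    float dims[2];
    ggml_rope_yarn_corr_dims(/*n_dims*/ 128, /*n_orig_ctx*/ 4096, /*freq_base*/ 10000.0f,
                             /*beta_fast*/ 32.0f, /*beta_slow*/ 1.0f, dims);
    // dims[0]..dims[1] bound the rotary dimensions over which YaRN ramps between
    // interpolation and extrapolation, matching rope_yarn_ramp() above
    printf("YaRN correction dims: [%.2f, %.2f]\n", dims[0], dims[1]);
    return 0;
}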
typedef void (rope_t)( typedef void (rope_t)(
device const void * src0, device const void * src0,
device const int32_t * src1, device const int32_t * src1,
@ -1011,8 +1123,13 @@ typedef void (rope_t)(
constant int & n_past, constant int & n_past,
constant int & n_dims, constant int & n_dims,
constant int & mode, constant int & mode,
constant int & n_orig_ctx,
constant float & freq_base, constant float & freq_base,
constant float & freq_scale, constant float & freq_scale,
constant float & ext_factor,
constant float & attn_factor,
constant float & beta_fast,
constant float & beta_slow,
uint tiitg[[thread_index_in_threadgroup]], uint tiitg[[thread_index_in_threadgroup]],
uint3 tptg[[threads_per_threadgroup]], uint3 tptg[[threads_per_threadgroup]],
uint3 tgpig[[threadgroup_position_in_grid]]); uint3 tgpig[[threadgroup_position_in_grid]]);
@ -1041,8 +1158,13 @@ kernel void kernel_rope(
constant int & n_past, constant int & n_past,
constant int & n_dims, constant int & n_dims,
constant int & mode, constant int & mode,
constant int & n_orig_ctx,
constant float & freq_base, constant float & freq_base,
constant float & freq_scale, constant float & freq_scale,
constant float & ext_factor,
constant float & attn_factor,
constant float & beta_fast,
constant float & beta_slow,
uint tiitg[[thread_index_in_threadgroup]], uint tiitg[[thread_index_in_threadgroup]],
uint3 tptg[[threads_per_threadgroup]], uint3 tptg[[threads_per_threadgroup]],
uint3 tgpig[[threadgroup_position_in_grid]]) { uint3 tgpig[[threadgroup_position_in_grid]]) {
@ -1052,19 +1174,22 @@ kernel void kernel_rope(
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
float corr_dims[2];
rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
device const int32_t * pos = src1; device const int32_t * pos = src1;
const int64_t p = pos[i2]; const int64_t p = pos[i2];
const float theta_0 = freq_scale * (float)p; const float theta_0 = (float)p;
const float inv_ndims = -1.f/n_dims; const float inv_ndims = -1.f/n_dims;
if (!is_neox) { if (!is_neox) {
for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
const float theta = theta_0 * pow(freq_base, inv_ndims*i0); const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
const float cos_theta = cos(theta); float cos_theta, sin_theta;
const float sin_theta = sin(theta); rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -1079,9 +1204,12 @@ kernel void kernel_rope(
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); // simplified from `(ib * n_dims + ic) * inv_ndims`
const float cos_theta = cos(theta); const float cur_rot = inv_ndims*ic - ib;
const float sin_theta = sin(theta);
const float theta = theta_0 * pow(freq_base, cur_rot);
float cos_theta, sin_theta;
rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
const int64_t i0 = ib*n_dims + ic/2; const int64_t i0 = ib*n_dims + ic/2;

File diff suppressed because it is too large


@ -1,11 +1,63 @@
#pragma once #pragma once
#include "ggml.h" #include "ggml-impl.h"
// GGML internal header
#include <stdint.h> #include <stdint.h>
#include <assert.h>
#include <stddef.h> #include <stddef.h>
#define QK4_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
#define QK4_1 32
typedef struct {
ggml_fp16_t d; // delta
ggml_fp16_t m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK5_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
#define QK5_1 32
typedef struct {
ggml_fp16_t d; // delta
ggml_fp16_t m; // min
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
#define QK8_0 32
typedef struct {
ggml_fp16_t d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
#define QK8_1 32
typedef struct {
float d; // delta
float s; // d * sum(qs[i])
int8_t qs[QK8_1]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
//
// Super-block quantization structures
//
// Super-block size // Super-block size
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
#define QK_K 64 #define QK_K 64
@ -15,18 +67,6 @@
#define K_SCALE_SIZE 12 #define K_SCALE_SIZE 12
#endif #endif
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
//
// Super-block quantization structures
//
// 2-bit quantization // 2-bit quantization
// weight is represented as x = a * q + b // weight is represented as x = a * q + b
// 16 blocks of 16 elements each // 16 blocks of 16 elements each
@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
// Quantization // Quantization
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
// Dequantization // Dequantization
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
// Dot product // Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
// Quantization with histogram collection
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
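The block layouts added above determine the storage cost of each quantization format directly. A small reading aid (assumes the header above, with the block_* definitions and QK* constants, is on the include path):

#include <cstdio>

int main() {
    // q4_0: 2 bytes (fp16 delta) + 16 bytes (32 x 4-bit quants) per 32 weights
    const double bpw_q4_0 = (double) sizeof(block_q4_0) / QK4_0;   // 18/32 = 0.5625 bytes/weight
    // q8_0: 2 bytes (fp16 delta) + 32 bytes (32 x 8-bit quants) per 32 weights
    const double bpw_q8_0 = (double) sizeof(block_q8_0) / QK8_0;   // 34/32 = 1.0625 bytes/weight
    printf("q4_0: %.4f B/weight, q8_0: %.4f B/weight\n", bpw_q4_0, bpw_q8_0);
    return 0;
}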

ggml.c (3332 lines changed)
File diff suppressed because it is too large

ggml.h (44 lines changed)

@ -219,7 +219,7 @@
#define GGML_MAX_CONTEXTS 64 #define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6 #define GGML_MAX_SRC 6
#define GGML_MAX_NAME 64 #define GGML_MAX_NAME 64
#define GGML_MAX_OP_PARAMS 32 #define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_N_THREADS 4
#if UINTPTR_MAX == 0xFFFFFFFF #if UINTPTR_MAX == 0xFFFFFFFF
@ -401,15 +401,16 @@ extern "C" {
GGML_OP_ALIBI, GGML_OP_ALIBI,
GGML_OP_CLAMP, GGML_OP_CLAMP,
GGML_OP_CONV_1D, GGML_OP_CONV_1D,
GGML_OP_CONV_2D, GGML_OP_CONV_1D_STAGE_0, // internal
GGML_OP_CONV_1D_STAGE_1, // internal
GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_CONV_2D,
GGML_OP_CONV_2D_STAGE_0, // internal
GGML_OP_CONV_2D_STAGE_1, // internal
GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D, GGML_OP_POOL_1D,
GGML_OP_POOL_2D, GGML_OP_POOL_2D,
GGML_OP_CONV_1D_STAGE_0, // internal
GGML_OP_CONV_1D_STAGE_1, // internal
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_FLASH_ATTN, GGML_OP_FLASH_ATTN,
@ -708,7 +709,7 @@ extern "C" {
// Context tensor enumeration and lookup // Context tensor enumeration and lookup
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx); GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
@ -1020,9 +1021,9 @@ extern "C" {
struct ggml_tensor * b, struct ggml_tensor * b,
float eps); float eps);
// A: n columns, m rows // A: k columns, n rows => [ne03, ne02, n, k]
// B: n columns, p rows (i.e. we transpose it internally) // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
// result is m columns, p rows // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
GGML_API struct ggml_tensor * ggml_mul_mat( GGML_API struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1325,8 +1326,13 @@ extern "C" {
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
int n_orig_ctx,
float freq_base, float freq_base,
float freq_scale); float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);
// in-place, returns view(a) // in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_custom_inplace( GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
@ -1336,8 +1342,17 @@ extern "C" {
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
int n_orig_ctx,
float freq_base, float freq_base,
float freq_scale); float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);
// compute correction dims for YaRN RoPE scaling
void ggml_rope_yarn_corr_dims(
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
// xPos RoPE, in-place, returns view(a) // xPos RoPE, in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
@ -1929,12 +1944,19 @@ extern "C" {
// quantization // quantization
// //
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
// //


@ -7,7 +7,7 @@ import shutil
import struct import struct
import sys import sys
import tempfile import tempfile
from enum import IntEnum, auto from enum import Enum, IntEnum, auto
from io import BufferedWriter from io import BufferedWriter
from pathlib import Path from pathlib import Path
from typing import IO, Any, BinaryIO, Callable, Sequence from typing import IO, Any, BinaryIO, Callable, Sequence
@ -55,7 +55,10 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
# RoPE # RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base" KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base"
KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear" KEY_ROPE_SCALING_TYPE = "{arch}.rope.scaling.type"
KEY_ROPE_SCALING_FACTOR = "{arch}.rope.scaling.factor"
KEY_ROPE_SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
KEY_ROPE_SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
# tokenization # tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model" KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
@ -408,6 +411,7 @@ class TensorNameMap:
"layers.{bid}.attention_norm", # llama-pth "layers.{bid}.attention_norm", # llama-pth
"encoder.layer.{bid}.attention.output.LayerNorm", # bert "encoder.layer.{bid}.attention.output.LayerNorm", # bert
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi
), ),
# Attention norm 2 # Attention norm 2
@ -479,6 +483,7 @@ class TensorNameMap:
"layers.{bid}.ffn_norm", # llama-pth "layers.{bid}.ffn_norm", # llama-pth
"encoder.layer.{bid}.output.LayerNorm", # bert "encoder.layer.{bid}.output.LayerNorm", # bert
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", # yi
), ),
# Feed-forward up # Feed-forward up
@ -595,6 +600,11 @@ class TokenType(IntEnum):
UNUSED = 5 UNUSED = 5
BYTE = 6 BYTE = 6
class RopeScalingType(Enum):
NONE = 'none'
LINEAR = 'linear'
YARN = 'yarn'
# #
# implementation # implementation
# #
@ -966,8 +976,17 @@ class GGUFWriter:
def add_rope_freq_base(self, value: float): def add_rope_freq_base(self, value: float):
self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value) self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
def add_rope_scale_linear(self, value: float): def add_rope_scaling_type(self, value: RopeScalingType):
self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value) self.add_string(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), value.value)
def add_rope_scaling_factor(self, value: float):
self.add_float32(KEY_ROPE_SCALING_FACTOR.format(arch=self.arch), value)
def add_rope_scaling_orig_ctx_len(self, value: int):
self.add_uint32(KEY_ROPE_SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
def add_rope_scaling_finetuned(self, value: bool):
self.add_bool(KEY_ROPE_SCALING_FINETUNED.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str): def add_tokenizer_model(self, model: str):
self.add_string(KEY_TOKENIZER_MODEL, model) self.add_string(KEY_TOKENIZER_MODEL, model)

llama.cpp (4273 lines changed)
File diff suppressed because it is too large

llama.h (36 lines changed)

@ -106,6 +106,14 @@ extern "C" {
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
}; };
enum llama_rope_scaling_type {
LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
LLAMA_ROPE_SCALING_NONE = 0,
LLAMA_ROPE_SCALING_LINEAR = 1,
LLAMA_ROPE_SCALING_YARN = 2,
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
};
typedef struct llama_token_data { typedef struct llama_token_data {
llama_token id; // token id llama_token id; // token id
float logit; // log-odds of the token float logit; // log-odds of the token
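The new enum mirrors the --rope-scaling values handled at the top of this diff ("none", "linear", "yarn"). A small sketch of that mapping, shown next to the enum for reference (the helper name is illustrative; the real parsing lives in the argument handlers above):

#include "llama.h"
#include <string>

static int8_t rope_scaling_type_from_string(const std::string & s) {
    if (s == "none")   { return LLAMA_ROPE_SCALING_NONE;   }
    if (s == "linear") { return LLAMA_ROPE_SCALING_LINEAR; }
    if (s == "yarn")   { return LLAMA_ROPE_SCALING_YARN;   }
    return LLAMA_ROPE_SCALING_UNSPECIFIED;
}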
@ -172,13 +180,19 @@ extern "C" {
uint32_t n_batch; // prompt processing maximum batch size uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing uint32_t n_threads_batch; // number of threads to use for batch processing
int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
// ref: https://github.com/ggerganov/llama.cpp/pull/2054 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
float yarn_attn_factor; // YaRN magnitude scaling factor
float yarn_beta_fast; // YaRN low correction dim
float yarn_beta_slow; // YaRN high correction dim
uint32_t yarn_orig_ctx; // YaRN original context size
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool mul_mat_q; // if true, use experimental mul_mat_q kernels bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
bool f16_kv; // use fp16 for KV cache, fp32 otherwise bool f16_kv; // use fp16 for KV cache, fp32 otherwise
bool logits_all; // the llama_eval() call computes all logits, not just the last one bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool embedding; // embedding mode only bool embedding; // embedding mode only
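A hedged usage sketch for the new context-parameter fields: enabling YaRN scaling when creating a context. The numbers are illustrative rather than recommendations, and model is assumed to be a previously loaded llama_model:

llama_context_params cparams = llama_context_default_params();
cparams.rope_scaling_type = LLAMA_ROPE_SCALING_YARN;
cparams.rope_freq_scale   = 0.25f;   // 1/4 => target roughly 4x the original context
cparams.yarn_orig_ctx     = 4096;    // context length the model was trained with
cparams.yarn_ext_factor   = 1.0f;    // full YaRN extrapolation mix
cparams.yarn_attn_factor  = 1.0f;
cparams.yarn_beta_fast    = 32.0f;
cparams.yarn_beta_slow    = 1.0f;

llama_context * ctx = llama_new_context_with_model(model, cparams);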
@ -191,6 +205,7 @@ extern "C" {
bool allow_requantize; // allow quantizing non-f32/f16 tensors bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // disable k-quant mixtures and quantize all tensors to the same type
} llama_model_quantize_params; } llama_model_quantize_params;
// grammar types // grammar types
@@ -333,15 +348,12 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Remove all tokens data of cells in [c0, c1)
-    // c0 < 0 : [0, c1]
-    // c1 < 0 : [c0, inf)
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-            int32_t c0,
-            int32_t c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // seq_id < 0 : match any sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
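
Illustrative aside: code that previously called llama_kv_cache_tokens_rm would migrate to the replacement API roughly like this (ctx is an existing llama_context; the seq_id < 0 wildcard follows the comment added above):

llama_kv_cache_clear(ctx);               // wipe the entire KV cache
llama_kv_cache_seq_rm(ctx,  0, 32, -1);  // sequence 0: drop positions [32, inf)
llama_kv_cache_seq_rm(ctx, -1,  0, -1);  // any sequence: drop everything
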
@@ -600,6 +612,13 @@ extern "C" {
             float   p,
             size_t  min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float   p,
+            size_t  min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
@@ -658,6 +677,7 @@ extern "C" {
             float * mu);
 
     /// @details Selects the token with the highest probability.
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
             struct llama_context * ctx,
             llama_token_data_array * candidates);
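
Illustrative aside: llama_sample_min_p slots into the usual sampling chain right before the final token pick. A minimal C++ sketch, assuming candidates is a llama_token_data_array already filled from the current logits:

// sketch: keep only tokens whose probability is at least 5% of the top token's
llama_sample_min_p(ctx, &candidates, 0.05f, 1);
llama_token tok = llama_sample_token(ctx, &candidates);
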

Binary file not shown.

scripts/build-info.cmake
View file

@@ -1,5 +1,5 @@
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
-set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 set(BUILD_NUMBER 0)
 set(BUILD_COMMIT "unknown")
 set(BUILD_COMPILER "unknown")
@@ -24,15 +24,21 @@ if(Git_FOUND)
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         OUTPUT_VARIABLE HEAD
         OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
     )
+    if (RES EQUAL 0)
+        set(BUILD_COMMIT ${HEAD})
+    endif()
     execute_process(
         COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         OUTPUT_VARIABLE COUNT
         OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
     )
-    set(BUILD_COMMIT ${HEAD})
-    set(BUILD_NUMBER ${COUNT})
+    if (RES EQUAL 0)
+        set(BUILD_NUMBER ${COUNT})
+    endif()
 endif()
 
 if(MSVC)
@@ -53,22 +59,22 @@ else()
     set(BUILD_TARGET ${OUT})
 endif()
 
-# Only write the header if it's changed to prevent unnecessary recompilation
-if(EXISTS ${HEADER_FILE})
-    file(READ ${HEADER_FILE} CONTENTS)
-    string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
+# Only write the build info if it changed
+if(EXISTS ${OUTPUT_FILE})
+    file(READ ${OUTPUT_FILE} CONTENTS)
+    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
+    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
+    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
    set(OLD_TARGET ${CMAKE_MATCH_1})
     if (
         NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
         NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
         NOT OLD_TARGET STREQUAL BUILD_TARGET
     )
-        configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
+        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
     endif()
 else()
-    configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
+    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
 endif()

scripts/build-info.h.in
View file

@@ -1,9 +0,0 @@
-#ifndef BUILD_INFO_H
-#define BUILD_INFO_H
-
-#define BUILD_NUMBER @BUILD_NUMBER@
-#define BUILD_COMMIT "@BUILD_COMMIT@"
-#define BUILD_COMPILER "@BUILD_COMPILER@"
-#define BUILD_TARGET "@BUILD_TARGET@"
-
-#endif // BUILD_INFO_H

scripts/build-info.sh
View file

@@ -24,12 +24,7 @@ if out=$($CC -dumpmachine); then
     build_target=$out
 fi
 
-echo "#ifndef BUILD_INFO_H"
-echo "#define BUILD_INFO_H"
-echo
-echo "#define BUILD_NUMBER $build_number"
-echo "#define BUILD_COMMIT \"$build_commit\""
-echo "#define BUILD_COMPILER \"$build_compiler\""
-echo "#define BUILD_TARGET \"$build_target\""
-echo
-echo "#endif // BUILD_INFO_H"
+echo "int LLAMA_BUILD_NUMBER = ${build_number};"
+echo "char const *LLAMA_COMMIT = \"${build_commit}\";"
+echo "char const *LLAMA_COMPILER = \"${build_compiler}\";"
+echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";"

391
scripts/server-llm.sh Normal file
View file

@@ -0,0 +1,391 @@
#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal, OpenCL
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
#
# --port: port number, default is 8888
# --repo: path to a repo containing GGUF model files
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
# --backend: cpu, cuda, metal, opencl, depends on the OS
# --gpu-id: gpu id, default is 0
# --n-parallel: number of parallel requests, default is 8
# --n-kv: KV cache size, default is 4096
# --verbose: verbose output
#
# Example:
#
# bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
set -e
# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
printf "[-] curl not found\n"
exit 1
fi
if ! command -v git &> /dev/null; then
printf "[-] git not found\n"
exit 1
fi
if ! command -v make &> /dev/null; then
printf "[-] make not found\n"
exit 1
fi
# parse arguments
port=8888
repo=""
wtype=""
backend="cpu"
# if macOS, use metal backend by default
if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi
gpu_id=0
n_parallel=8
n_kv=4096
verbose=0
function print_usage {
printf "Usage:\n"
printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
printf " --port: port number, default is 8888\n"
printf " --repo: path to a repo containing GGUF model files\n"
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n"
printf " --gpu-id: gpu id, default is 0\n"
printf " --n-parallel: number of parallel requests, default is 8\n"
printf " --n-kv: KV cache size, default is 4096\n"
printf " --verbose: verbose output\n\n"
printf "Example:\n\n"
printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--port)
port="$2"
shift
shift
;;
--repo)
repo="$2"
shift
shift
;;
--wtype)
wtype="$2"
shift
shift
;;
--backend)
backend="$2"
shift
shift
;;
--gpu-id)
gpu_id="$2"
shift
shift
;;
--n-parallel)
n_parallel="$2"
shift
shift
;;
--n-kv)
n_kv="$2"
shift
shift
;;
--verbose)
verbose=1
shift
;;
--help)
print_usage
exit 0
;;
*)
echo "Unknown argument: $key"
print_usage
exit 1
;;
esac
done
# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
wfiles=()
for wt in "${wtypes[@]}"; do
wfiles+=("")
done
# sample repos
repos=(
"https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
"https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
"https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
"https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
"https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
"https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
"https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)
printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf " Based on the options that follow, the script might download a model file\n"
printf " from the internet, which can be a few GBs in size. The script will also\n"
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf " Upon success, an HTTP server will be started and it will serve the selected\n"
printf " model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf " Please note:\n"
printf "\n"
printf " - All new data will be stored in the current folder\n"
printf " - The server will be listening on all network interfaces\n"
printf " - The server will run with default settings which are not always optimal\n"
printf " - Do not judge the quality of a model based on the results from this script\n"
printf " - Do not use this script to benchmark llama.cpp\n"
printf " - Do not use this script in production\n"
printf " - This script is only for demonstration purposes\n"
printf "\n"
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf " Press Enter to continue ...\n\n"
read
if [[ -z "$repo" ]]; then
printf "[+] No repo provided from the command line\n"
printf " Please select a number from the list below or enter an URL:\n\n"
is=0
for r in "${repos[@]}"; do
printf " %2d) %s\n" $is "$r"
is=$((is+1))
done
# ask for repo until an index of a sample repo or a URL is provided
while [[ -z "$repo" ]]; do
printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
read -p "[+] Select repo: " repo
# check if the input is a number
if [[ "$repo" =~ ^[0-9]+$ ]]; then
if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
repo="${repos[$repo]}"
else
printf "[-] Invalid repo index: %s\n" "$repo"
repo=""
fi
elif [[ "$repo" =~ ^https?:// ]]; then
repo="$repo"
else
printf "[-] Invalid repo URL: %s\n" "$repo"
repo=""
fi
done
fi
# remove suffix
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
printf "[+] Checking for GGUF model files in %s\n" "$repo"
# find GGUF files in the source
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
# list all files in the provided git repo
printf "[+] Model files:\n\n"
for file in $model_files; do
# determine iw by grepping the filename with wtypes
iw=-1
is=0
for wt in "${wtypes[@]}"; do
# uppercase
ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
if [[ "$ufile" =~ "$wt" ]]; then
iw=$is
break
fi
is=$((is+1))
done
if [[ $iw -eq -1 ]]; then
continue
fi
wfiles[$iw]="$file"
have=" "
if [[ -f "$file" ]]; then
have="*"
fi
printf " %2d) %s %s\n" $iw "$have" "$file"
done
# ask for weights type until provided and available
while [[ -z "$wtype" ]]; do
printf "\n"
read -p "[+] Select weight type: " wtype
wfile="${wfiles[$wtype]}"
if [[ -z "$wfile" ]]; then
printf "[-] Invalid weight type: %s\n" "$wtype"
wtype=""
fi
done
printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
url="${repo%/}/resolve/main/$wfile"
# check file if the model has been downloaded before
chk="$wfile.chk"
# check if we should download the file
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info
do_download=0
if [[ ! -f "$wfile" ]]; then
do_download=1
elif [[ ! -f "$chk" ]]; then
do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
do_download=1
fi
if [[ $do_download -eq 1 ]]; then
printf "[+] Downloading weights from %s\n" "$url"
# download the weights file
curl -o "$wfile" -# -L "$url"
# create a check file if successful
if [[ $? -eq 0 ]]; then
printf "[+] Creating check file %s\n" "$chk"
touch "$chk"
fi
else
printf "[+] Using cached weights %s\n" "$wfile"
fi
# get latest llama.cpp and build
printf "[+] Downloading latest llama.cpp\n"
llama_cpp_dir="__llama_cpp_port_${port}__"
if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
# if the dir exists and there isn't a file "__ggml_script__" in it, abort
printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
printf "[-] Please remove it and try again\n"
exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
printf "[+] Using cached llama.cpp\n"
cd "$llama_cpp_dir"
git reset --hard
git fetch
git checkout origin/master
cd ..
else
printf "[+] Cloning llama.cpp\n"
git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi
# mark that the directory was made by this script
touch "$llama_cpp_dir/__ggml_script__"
if [[ $verbose -eq 1 ]]; then
set -x
fi
# build
cd "$llama_cpp_dir"
make clean
log="--silent"
if [[ $verbose -eq 1 ]]; then
log=""
fi
if [[ "$backend" == "cuda" ]]; then
printf "[+] Building with CUDA backend\n"
LLAMA_CUBLAS=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
printf "[+] Building with CPU backend\n"
make -j server $log
elif [[ "$backend" == "metal" ]]; then
printf "[+] Building with Metal backend\n"
make -j server $log
elif [[ "$backend" == "opencl" ]]; then
printf "[+] Building with OpenCL backend\n"
LLAMA_CLBLAST=1 make -j server $log
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
# run the server
printf "[+] Running server\n"
args=""
if [[ "$backend" == "cuda" ]]; then
export CUDA_VISIBLE_DEVICES=$gpu_id
args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
args="-ngl 999"
elif [[ "$backend" == "opencl" ]]; then
args="-ngl 999"
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
if [[ $verbose -eq 1 ]]; then
args="$args --verbose"
fi
./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
exit 0

View file

@@ -4,7 +4,7 @@
 #undef NDEBUG
 #include <cassert>
-#if !defined(__riscv) && !defined(__s390__)
+#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
 #include <immintrin.h>
 #endif
 #include <cmath>

View file

@@ -129,6 +129,13 @@ int main(int argc, char * argv[]) {
         ggml_type type = (ggml_type) i;
         ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
 
+        // deprecated - skip
+        if (qfns.blck_size == 0) {
+            continue;
+        }
+
+        printf("Testing %s\n", ggml_type_name((ggml_type) i));
+
         if (qfns.from_float && qfns.to_float) {
             const float total_error = total_quantization_error(qfns, test_size, test_data.data());
             const float max_quantization_error =