Merge branch 'master' into stablelm-support

commit 8917767f56

71 changed files with 7311 additions and 6369 deletions
.github/ISSUE_TEMPLATE/bug.md (vendored, 2 changes)

--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@@ -1,7 +1,7 @@
 ---
 name: Bug template
 about: Used to report bugs in llama.cpp
-labels: ["bug"]
+labels: ["bug-unconfirmed"]
 assignees: ''
 
 ---
.gitignore (vendored, 3 changes)

--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@
 .DS_Store
 .build/
 .cache/
+.ccls-cache/
 .direnv/
 .envrc
 .swiftpm
@@ -64,7 +65,7 @@ models-mnt
 /parallel
 /train-text-from-scratch
 /vdot
-build-info.h
+/common/build-info.cpp
 arm_neon.h
 compile_commands.json
 CMakeSettings.json
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ endif()
 
 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
+option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
 option(LLAMA_LTO "llama: enable link time optimization" OFF)
 
 # debug
@@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use CUDA" OFF)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
+option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -93,46 +94,12 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
-option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ON)
 
-#
-# Build info header
-#
-
-# Generate initial build-info.h
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
-
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-
-    # Is git submodule
-    if(NOT IS_DIRECTORY "${GIT_DIR}")
-        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
-        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
-    endif()
-
-    # Add a custom target for build-info.h
-    add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
-
-    # Add a custom command to rebuild build-info.h when .git/index changes
-    add_custom_command(
-        OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
-        COMMENT "Generating build details from Git"
-        COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        DEPENDS "${GIT_DIR}/index"
-        VERBATIM
-    )
-else()
-    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-endif()
-
 #
 # Compile flags
 #
@@ -277,14 +244,9 @@ if (LLAMA_BLAS)
     endif()
 endif()
 
-if (LLAMA_K_QUANTS)
-    set(GGML_HEADERS_EXTRA k_quants.h)
-    set(GGML_SOURCES_EXTRA k_quants.c)
-    add_compile_definitions(GGML_USE_K_QUANTS)
 if (LLAMA_QKK_64)
     add_compile_definitions(GGML_QKK_64)
 endif()
-endif()
 
 if (LLAMA_CUBLAS)
     cmake_minimum_required(VERSION 3.17)
@@ -305,6 +267,9 @@ if (LLAMA_CUBLAS)
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()
+    if (LLAMA_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -331,6 +296,7 @@ if (LLAMA_CUBLAS)
         set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
     else()
         set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+        #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
     endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -404,6 +370,9 @@ if (LLAMA_HIPBLAS)
     if (LLAMA_CUDA_FORCE_DMMV)
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
     endif()
+    if (LLAMA_CUDA_FORCE_MMQ)
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
+    endif()
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -665,6 +634,8 @@ add_library(ggml OBJECT
             ggml-alloc.h
             ggml-backend.c
             ggml-backend.h
+            ggml-quants.c
+            ggml-quants.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
Makefile (88 changes)

--- a/Makefile
+++ b/Makefile
@@ -342,13 +342,9 @@ else
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifndef LLAMA_NO_K_QUANTS
-	MK_CPPFLAGS += -DGGML_USE_K_QUANTS
-	OBJS += k_quants.o
 ifdef LLAMA_QKK_64
 	MK_CPPFLAGS += -DGGML_QKK_64
 endif
-endif
 
 ifndef LLAMA_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
@@ -397,6 +393,9 @@ endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
 	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_FORCE_MMQ
+	NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
@@ -494,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
-ifndef LLAMA_NO_K_QUANTS
-k_quants.o: k_quants.c k_quants.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_NO_K_QUANTS
-
 # combine build flags with cmdline overrides
 override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
 override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
@@ -539,13 +533,16 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
-COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
 
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -566,46 +563,46 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 
 #
 # Examples
 #
 
-main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
@@ -617,7 +614,7 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
@@ -626,19 +623,19 @@ llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 ifdef LLAMA_METAL
@@ -651,7 +648,7 @@ swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
 endif
 
-build-info.h: $(wildcard .git/index) scripts/build-info.sh
+common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh $(CC) > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
 		mv $@.tmp $@; \
@@ -659,13 +656,16 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 		rm $@.tmp; \
 	fi
 
+build-info.o: common/build-info.cpp
+	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
+
 #
 # Tests
 #
 
 tests: $(TEST_TARGETS)
 
-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 run-benchmark-matmult: benchmark-matmult
@@ -679,40 +679,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 tests/test-c.o: tests/test-c.c llama.h
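The common/build-info.cpp rule above regenerates the file from scripts/build-info.sh whenever .git/index changes, and build-info.o is what the example binaries now link against. For illustration only, a generated file might look like the following; the values are invented, and the field layout follows the build-info.cpp.in template added by this commit:

    // Hypothetical output of scripts/build-info.sh (values invented for
    // illustration; the shape matches common/build-info.cpp.in below).
    int LLAMA_BUILD_NUMBER = 0;
    char const *LLAMA_COMMIT = "8917767f56";
    char const *LLAMA_COMPILER = "cc (GCC) 13.2.0";
    char const *LLAMA_BUILD_TARGET = "x86_64-pc-linux-gnu";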
--- a/Package.swift
+++ b/Package.swift
@@ -42,13 +42,12 @@ let package = Package(
             "llama.cpp",
             "ggml-alloc.c",
             "ggml-backend.c",
-            "k_quants.c",
+            "ggml-quants.c",
         ] + additionalSources,
         resources: resources,
         publicHeadersPath: "spm-headers",
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-            .define("GGML_USE_K_QUANTS"),
             .define("GGML_USE_ACCELERATE")
             // NOTE: NEW_LAPACK will required iOS version 16.4+
             // We should consider add this in the future when we drop support for iOS 14
--- a/README.md
+++ b/README.md
@@ -2,7 +2,6 @@
 
 
 
-[](https://github.com/ggerganov/llama.cpp/actions)
 [](https://opensource.org/licenses/MIT)
 
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -11,8 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
-- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
-- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
+- ⚠️ **Upcoming change that might break functionality. Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912
 
 ----
build.zig (45 changes)

--- a/build.zig
+++ b/build.zig
@@ -10,7 +10,6 @@ const Maker = struct {
     builder: *std.build.Builder,
     target: CrossTarget,
     optimize: Mode,
-    config_header: *ConfigHeader,
     enable_lto: bool,
 
     include_dirs: ArrayList([]const u8),
@@ -41,26 +40,24 @@ const Maker = struct {
         const commit_hash = try std.ChildProcess.exec(
             .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
         );
-        const config_header = builder.addConfigHeader(
-            .{ .style = .blank, .include_path = "build-info.h" },
-            .{
-                .BUILD_NUMBER = 0,
-                .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
-                .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
-                .BUILD_TARGET = try target.allocDescription(builder.allocator),
-            },
-        );
+        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
+            \\int LLAMA_BUILD_NUMBER = {};
+            \\char const *LLAMA_COMMIT = "{s}";
+            \\char const *LLAMA_COMPILER = "Zig {s}";
+            \\char const *LLAMA_BUILD_TARGET = "{s}";
+            \\
+        , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
         var m = Maker{
             .builder = builder,
             .target = target,
             .optimize = builder.standardOptimizeOption(.{}),
-            .config_header = config_header,
             .enable_lto = false,
             .include_dirs = ArrayList([]const u8).init(builder.allocator),
             .cflags = ArrayList([]const u8).init(builder.allocator),
             .cxxflags = ArrayList([]const u8).init(builder.allocator),
             .objs = ArrayList(*Compile).init(builder.allocator),
         };
 
         try m.addCFlag("-std=c11");
         try m.addCxxFlag("-std=c++11");
         try m.addProjectInclude(&.{});
@@ -72,7 +69,7 @@ const Maker = struct {
         const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
         if (o.target.getAbi() != .msvc)
             o.defineCMacro("_GNU_SOURCE", null);
-        o.addConfigHeader(m.config_header);
+
         if (std.mem.endsWith(u8, src, ".c")) {
             o.addCSourceFiles(&.{src}, m.cflags.items);
             o.linkLibC();
@@ -85,7 +82,6 @@ const Maker = struct {
                 o.linkLibCpp();
             }
         }
-        o.addConfigHeader(m.config_header);
         for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
         o.want_lto = m.enable_lto;
         return o;
@@ -105,7 +101,6 @@ const Maker = struct {
             // linkLibCpp already add (libc++ + libunwind + libc)
             e.linkLibCpp();
         }
-        e.addConfigHeader(m.config_header);
         m.builder.installArtifact(e);
         e.want_lto = m.enable_lto;
         return e;
@@ -116,16 +111,12 @@ pub fn build(b: *std.build.Builder) !void {
     var make = try Maker.init(b);
    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
 
-    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
-        try make.addFlag("-DGGML_USE_K_QUANTS");
-        const k_quants = make.obj("k_quants", "k_quants.c");
-        try make.objs.append(k_quants);
-    }
-
     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
+    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
     const llama = make.obj("llama", "llama.cpp");
+    const buildinfo = make.obj("common", "common/build-info.cpp");
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("console", "common/console.cpp");
     const sampling = make.obj("sampling", "common/sampling.cpp");
@@ -133,14 +124,14 @@ pub fn build(b: *std.build.Builder) !void {
     const train = make.obj("train", "common/train.cpp");
     const clip = make.obj("clip", "examples/llava/clip.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,8 +1,46 @@
 # common
 
+# Build info header
+#
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+
+    # Is git submodule
+    if(NOT IS_DIRECTORY "${GIT_DIR}")
+        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
+        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
+        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+    endif()
+
+    set(GIT_INDEX "${GIT_DIR}/index")
+else()
+    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
+    set(GIT_INDEX "")
+endif()
+
+# Add a custom command to rebuild build-info.cpp when .git/index changes
+add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
+    COMMENT "Generating build details from Git"
+    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
+            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
+    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
+    VERBATIM
+)
+set(TARGET build_info)
+add_library(${TARGET} OBJECT build-info.cpp)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
 set(TARGET common)
 
-add_library(${TARGET} OBJECT
+add_library(${TARGET} STATIC
     common.h
     common.cpp
     sampling.h
@@ -21,4 +59,4 @@ endif()
 
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
+target_link_libraries(${TARGET} PRIVATE llama build_info)
common/build-info.cpp.in (new file, 4 additions)

--- /dev/null
+++ b/common/build-info.cpp.in
@@ -0,0 +1,4 @@
+int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
+char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
+char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
+char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
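This template replaces the macros from the removed build-info.h, turning build metadata into ordinary link-time symbols so that a change in Git state no longer forces a recompile of everything that included the header. A minimal consumer sketch follows; the extern declarations are an assumption for this example, since the diff only shows the definitions:

    // Sketch: reading the link-time build info symbols. The externs are
    // assumed here; in the tree they would live in a shared header.
    #include <cstdio>

    extern int LLAMA_BUILD_NUMBER;
    extern char const *LLAMA_COMMIT;

    void print_build_info() {
        // Same pattern as the dump_non_result_info_yaml() change below.
        std::printf("build_commit: %s\n", LLAMA_COMMIT);
        std::printf("build_number: %d\n", LLAMA_BUILD_NUMBER);
    }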
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,5 +1,4 @@
 #include "common.h"
-#include "build-info.h"
 #include "llama.h"
 
 #include <algorithm>
@@ -103,9 +102,24 @@ void process_escapes(std::string& input) {
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    bool result = true;
+    try {
+        if (!gpt_params_parse_ex(argc, argv, params)) {
+            gpt_print_usage(argc, argv, gpt_params());
+            exit(0);
+        }
+    }
+    catch (const std::invalid_argument & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        gpt_print_usage(argc, argv, gpt_params());
+        exit(1);
+    }
+    return result;
+}
+
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
-    gpt_params default_params;
     const std::string arg_prefix = "--";
     llama_sampling_params & sparams = params.sparams;
 
@@ -204,12 +218,52 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.rope_freq_scale = std::stof(argv[i]);
+        } else if (arg == "--rope-scaling") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string value(argv[i]);
+            /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
+            else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
+            else { invalid_param = true; break; }
         } else if (arg == "--rope-scale") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rope_freq_scale = 1.0f/std::stof(argv[i]);
+        } else if (arg == "--yarn-orig-ctx") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_orig_ctx = std::stoi(argv[i]);
+        } else if (arg == "--yarn-ext-factor") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_ext_factor = std::stof(argv[i]);
+        } else if (arg == "--yarn-attn-factor") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_attn_factor = std::stof(argv[i]);
+        } else if (arg == "--yarn-beta-fast") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_beta_fast = std::stof(argv[i]);
+        } else if (arg == "--yarn-beta-slow") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_beta_slow = std::stof(argv[i]);
         } else if (arg == "--memory-f32") {
             params.memory_f16 = false;
         } else if (arg == "--top-p") {
@@ -218,12 +272,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             sparams.top_p = std::stof(argv[i]);
+        } else if (arg == "--min-p") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.min_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.temp = std::stof(argv[i]);
+            sparams.temp = std::max(sparams.temp, 0.0f);
         } else if (arg == "--tfs") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -342,6 +403,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_sequences = std::stoi(argv[i]);
+        } else if (arg == "--p-accept" || arg == "-pa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_accept = std::stof(argv[i]);
+        } else if (arg == "--p-split" || arg == "-ps") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_split = std::stof(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -547,11 +620,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
         } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, default_params);
-#ifndef LOG_DISABLE_LOGS
-            log_print_usage();
-#endif // LOG_DISABLE_LOGS
-            exit(0);
+            return false;
         } else if (arg == "--random-prompt") {
             params.random_prompt = true;
         } else if (arg == "--in-prefix-bos") {
@@ -610,22 +680,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            // End of Parse args for logging parameters
 #endif // LOG_DISABLE_LOGS
         } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, default_params);
-            exit(1);
+            throw std::invalid_argument("error: unknown argument: " + arg);
         }
     }
     if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, default_params);
-        exit(1);
+        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
     }
     if (params.prompt_cache_all &&
             (params.interactive || params.interactive_first ||
             params.instruct)) {
-        fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
-        gpt_print_usage(argc, argv, default_params);
-        exit(1);
+
+        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }
 
     if (params.escape) {
@@ -644,6 +709,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;
 
+    printf("\n");
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@@ -678,6 +744,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
     printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
     printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
     printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@@ -700,9 +767,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --cfg-negative-prompt-file FNAME\n");
     printf(" negative prompt file to use for guidance. (default: empty)\n");
     printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
-    printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
+    printf(" --rope-scaling {none,linear,yarn}\n");
+    printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
+    printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
     printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
-    printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
+    printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
+    printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
+    printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
+    printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
+    printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
+    printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
     printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf(" --no-penalize-nl do not penalize newline token\n");
     printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
@@ -716,6 +790,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
     printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
+    printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
+    printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
     printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
     printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
@@ -743,7 +819,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
     printf(" --verbose-prompt print prompt before generation\n");
-    fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
+    printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
     printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -754,6 +830,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -ld LOGDIR, --logdir LOGDIR\n");
     printf(" path under which to save YAML logs (no logging if unset)\n");
     printf("\n");
+#ifndef LOG_DISABLE_LOGS
+    log_print_usage();
+#endif // LOG_DISABLE_LOGS
 }
 
 std::string get_system_info(const gpt_params & params) {
@@ -816,8 +895,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.f16_kv = params.memory_f16;
     cparams.logits_all = params.logits_all;
     cparams.embedding = params.embedding;
+    cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base = params.rope_freq_base;
     cparams.rope_freq_scale = params.rope_freq_scale;
+    cparams.yarn_ext_factor = params.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor;
+    cparams.yarn_beta_fast = params.yarn_beta_fast;
+    cparams.yarn_beta_slow = params.yarn_beta_slow;
+    cparams.yarn_orig_ctx = params.yarn_orig_ctx;
 
     return cparams;
 }
@@ -888,7 +973,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-        llama_kv_cache_tokens_rm(lctx, -1, -1);
+        llama_kv_cache_clear(lctx);
         llama_reset_timings(lctx);
     }
 
@@ -1127,8 +1212,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
     const llama_sampling_params & sparams = params.sparams;
 
-    fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
-    fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
+    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
+    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
     fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
     fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
     fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
@@ -1274,6 +1359,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
@@ -9,6 +9,7 @@
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
+#include <cmath>
 #include <string>
 #include <vector>
 #include <random>
@@ -26,10 +27,16 @@
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
 
 #define print_build_info() do { \
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)
 
+// build info
+extern int LLAMA_BUILD_NUMBER;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;
+
 //
 // CLI argument parsing
 //
@@ -37,6 +44,7 @@ int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed = -1; // RNG seed
+
     int32_t n_threads = get_num_physical_cores();
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_predict = -1; // new tokens to predict
@@ -47,6 +55,8 @@ struct gpt_params {
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
+    float p_accept = 0.5f; // speculative decoding accept probability
+    float p_split = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
@@ -54,6 +64,13 @@ struct gpt_params {
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+    float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+    float yarn_beta_fast = 32.0f; // YaRN low correction dim
+    float yarn_beta_slow = 1.0f; // YaRN high correction dim
+    int32_t yarn_orig_ctx = 0; // YaRN original context length
+    int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+                                                               // pinging @cebtenzzre
 
     // // sampling parameters
     struct llama_sampling_params sparams;
@@ -110,6 +127,8 @@ struct gpt_params {
     std::string image = ""; // path to an image file
 };
 
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
 
 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
87 common/log.h

@@ -97,6 +97,15 @@
 #define LOG_TEE_TARGET stderr
 #endif
 
+// Utility for synchronizing log configuration state
+// since std::optional was introduced only in c++17
+enum LogTriState
+{
+    LogTriStateSame,
+    LogTriStateFalse,
+    LogTriStateTrue
+};
+
 // Utility to obtain "pid" like unique process id and use it when creating log files.
 inline std::string log_get_pid()
 {
@@ -118,16 +127,26 @@ inline std::string log_get_pid()
 // invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
 // where the number is a runtime id of the current thread.
 
-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
 
 // INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
+inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
 {
+    static bool _multilog = false;
+
+    if (multilog != LogTriStateSame)
+    {
+        _multilog = multilog == LogTriStateTrue;
+    }
+
     std::stringstream buf;
 
     buf << log_file_basename;
+    if (_multilog)
+    {
     buf << ".";
     buf << log_get_pid();
+    }
     buf << ".";
     buf << log_file_extension;
 
@@ -212,15 +231,6 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 #define LOG_TEE_FLF_VAL ,""
 #endif
 
-// Utility for synchronizing log configuration state
-// since std::optional was introduced only in c++17
-enum LogTriState
-{
-    LogTriStateSame,
-    LogTriStateFalse,
-    LogTriStateTrue
-};
-
 // INTERNAL, DO NOT USE
 // USE LOG() INSTEAD
 //
@@ -314,16 +324,23 @@ enum LogTriState
 #endif
 
 // INTERNAL, DO NOT USE
-inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
+inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
 {
-    static bool _initialized{false};
-    static bool _disabled{(filename.empty() && target == nullptr)};
+    static bool _initialized = false;
+    static bool _append = false;
+    static bool _disabled = filename.empty() && target == nullptr;
     static std::string log_current_filename{filename};
     static FILE *log_current_target{target};
     static FILE *logfile = nullptr;
 
     if (change)
     {
+        if (append != LogTriStateSame)
+        {
+            _append = append == LogTriStateTrue;
+            return logfile;
+        }
+
         if (disable == LogTriStateTrue)
         {
             // Disable primary target
@@ -376,7 +393,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
             }
         }
 
-        logfile = fopen(filename.c_str(), "w");
+        logfile = fopen(filename.c_str(), _append ? "a" : "w");
     }
 
     if (!logfile)
@@ -397,9 +414,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
 }
 
 // INTERNAL, DO NOT USE
-inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
+inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
 {
-    return log_handler1_impl(change, disable, filename, target);
+    return log_handler1_impl(change, append, disable, filename, target);
 }
 
 // Disables logs entirely at runtime.
@@ -410,7 +427,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTri
 // INTERNAL, DO NOT USE
 inline FILE *log_disable_impl()
 {
-    return log_handler1_impl(true, LogTriStateTrue);
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
 }
 
 // Enables logs at runtime.
@@ -419,19 +436,31 @@ inline FILE *log_disable_impl()
 // INTERNAL, DO NOT USE
 inline FILE *log_enable_impl()
 {
-    return log_handler1_impl(true, LogTriStateFalse);
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
 }
 
 // Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
 #define log_set_target(target) log_set_target_impl(target)
 
 // INTERNAL, DO NOT USE
-inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
-inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
+inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
+inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
 
 // INTERNAL, DO NOT USE
 inline FILE *log_handler() { return log_handler1_impl(); }
 
+// Enable or disable creating separate log files for each run.
+// can ONLY be invoked BEFORE first log use.
+#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
+// Enable or disable append mode for log file.
+// can ONLY be invoked BEFORE first log use.
+#define log_append(enable) log_append_impl(enable)
+// INTERNAL, DO NOT USE
+inline FILE *log_append_impl(bool enable)
+{
+    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
+}
+
 inline void log_test()
 {
     log_disable();
@@ -493,6 +522,18 @@ inline bool log_param_single_parse(const std::string & param)
         return true;
     }
 
+    if (param == "--log-new")
+    {
+        log_multilog(true);
+        return true;
+    }
+
+    if (param == "--log-append")
+    {
+        log_append(true);
+        return true;
+    }
+
     return false;
 }
 
@@ -522,7 +563,9 @@ inline void log_print_usage()
     printf(" --log-disable Disable trace logs\n");
     printf(" --log-enable Enable trace logs\n");
     printf(" --log-file Specify a log filename (without extension)\n");
-    printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
+    printf(" --log-new Create a separate new log file on start. "
+           "Each log file will have unique name: \"<name>.<ID>.log\"\n");
+    printf(" --log-append Don't truncate the old log file.\n");
 }
 
 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
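The new log_multilog / log_append switches above are latched inside log_handler1_impl's statics, so per the "can ONLY be invoked BEFORE first log use" comments they must run before the first LOG call. A minimal usage sketch of the intended call order; the basename "llama" and this exact sequence are illustrative assumptions, not part of this diff:

    #include "log.h"

    int main() {
        log_multilog(true);   // opt in to one "<name>.<pid>.log" file per run
        log_append(true);     // reopen the log target with "a" instead of "w"
        log_set_target(log_filename_generator("llama", "log"));
        LOG("logging configured\n"); // the first log use locks in the settings above
        return 0;
    }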
@@ -39,6 +39,7 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
 void llama_sampling_reset(llama_sampling_context * ctx) {
     if (ctx->grammar != NULL) {
         llama_grammar_free(ctx->grammar);
+        ctx->grammar = NULL;
     }
 
     if (!ctx->parsed_grammar.rules.empty()) {
@@ -89,10 +90,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
 
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
+            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
             params.mirostat, params.mirostat_eta, params.mirostat_tau);
 
     return std::string(result);
@@ -110,6 +111,7 @@ llama_token llama_sampling_sample(
     const float temp = params.temp;
     const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
     const float top_p = params.top_p;
+    const float min_p = params.min_p;
     const float tfs_z = params.tfs_z;
     const float typical_p = params.typical_p;
     const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
@@ -167,8 +169,12 @@ llama_token llama_sampling_sample(
         llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
     }
 
-    if (temp <= 0) {
-        // greedy sampling
+    if (temp < 0.0) {
+        // greedy sampling, with probs
+        llama_sample_softmax(ctx_main, &cur_p);
+        id = cur_p.data[0].id;
+    } else if (temp == 0.0) {
+        // greedy sampling, no probs
         id = llama_sample_token_greedy(ctx_main, &cur_p);
     } else {
         if (mirostat == 1) {
@@ -186,6 +192,7 @@ llama_token llama_sampling_sample(
             llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
             llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
             llama_sample_top_p   (ctx_main, &cur_p, top_p,     min_keep);
+            llama_sample_min_p   (ctx_main, &cur_p, min_p,     min_keep);
             llama_sample_temp    (ctx_main, &cur_p, temp);
 
             id = llama_sample_token(ctx_main, &cur_p);
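The temperature dispatch above splits the old single `temp <= 0` greedy branch in two: a negative temperature now also runs a softmax first, so callers still get per-token probabilities before the arg-max is taken, while exactly zero keeps the cheaper probability-free greedy pick. A self-contained sketch of the same rule, using stand-in names rather than llama.cpp's internal API:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Returns the index of the greedily sampled token for the two regimes above.
    // probs_out is only filled when temp < 0, mirroring "greedy sampling, with probs".
    int greedy_dispatch(const std::vector<float> & logits, float temp, std::vector<float> * probs_out) {
        const int best = (int)(std::max_element(logits.begin(), logits.end()) - logits.begin());
        if (temp < 0.0f && probs_out) {
            // softmax over the logits so the caller can report per-token probabilities
            const float mx = logits[best];
            double sum = 0.0;
            probs_out->resize(logits.size());
            for (size_t i = 0; i < logits.size(); ++i) {
                (*probs_out)[i] = std::exp(logits[i] - mx);
                sum += (*probs_out)[i];
            }
            for (float & p : *probs_out) { p = (float)(p / sum); }
        }
        return best; // arg-max either way; only the probability bookkeeping differs
    }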
@@ -14,6 +14,7 @@ typedef struct llama_sampling_params {
     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t top_k = 40; // <= 0 to use vocab size
     float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
     float tfs_z = 1.00f; // 1.0 = disabled
     float typical_p = 1.00f; // 1.0 = disabled
     float temp = 0.80f; // 1.0 = disabled
@@ -1045,6 +1045,7 @@ struct train_params_common get_default_train_params_common() {
     params.n_batch = 8;
     params.n_gradient_accumulation = 1;
     params.n_epochs = -1;
+    params.n_gpu_layers = 0;
 
     params.custom_n_ctx = false;
 
@@ -1080,6 +1081,7 @@ struct train_params_common get_default_train_params_common() {
     params.adam_beta2 = 0.999f;
     params.adam_gclip = 1.0f;
     params.adam_eps_f = 0.0f;
 
     return params;
 }
@@ -44,6 +44,7 @@ struct train_params_common {
     int n_batch;
     int n_gradient_accumulation;
     int n_epochs;
+    int n_gpu_layers;
 
     bool custom_n_ctx;
 
@@ -163,7 +163,8 @@ gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
 if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
     if "type" in hparams["rope_scaling"]:
         if hparams["rope_scaling"]["type"] == "linear":
-            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
 
 
 # TOKENIZATION
110 convert.py

@@ -151,8 +151,11 @@ class Params:
     n_head_kv:  int
     f_norm_eps: float
 
+    rope_scaling_type: gguf.RopeScalingType | None = None
     f_rope_freq_base: float | None = None
     f_rope_scale: float | None = None
+    n_orig_ctx: int | None = None
+    rope_finetuned: bool | None = None
 
     ftype: GGMLFileType | None = None
 
@@ -198,20 +201,20 @@ class Params:
     def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))
 
-        n_vocab          = config["vocab_size"]
-        n_embd           = config["hidden_size"]
-        n_layer          = config["num_hidden_layers"]
-        n_ff             = config["intermediate_size"]
-        n_head           = config["num_attention_heads"]
-        n_head_kv        = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
-        f_norm_eps       = config["rms_norm_eps"]
-        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
+        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
 
         rope_scaling = config.get("rope_scaling")
-        if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
-            f_rope_scale = config["rope_scaling"].get("factor")
+        if rope_scaling is not None and (typ := rope_scaling.get("type")):
+            rope_factor = rope_scaling.get("factor")
+            f_rope_scale = rope_factor
+            if typ == "linear":
+                rope_scaling_type = gguf.RopeScalingType.LINEAR
+            elif typ == "yarn":
+                rope_scaling_type = gguf.RopeScalingType.YARN
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
+                rope_finetuned = rope_scaling['finetuned']
         else:
-            f_rope_scale = None
+            raise NotImplementedError(f'Unknown rope scaling type: {typ}')
 
         if "max_sequence_length" in config:
             n_ctx = config["max_sequence_length"]
@@ -222,16 +225,19 @@ class Params:
                 "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
 
         return Params(
-            n_vocab          = n_vocab,
-            n_embd           = n_embd,
-            n_layer          = n_layer,
+            n_vocab          = config["vocab_size"],
+            n_embd           = config["hidden_size"],
+            n_layer          = config["num_hidden_layers"],
             n_ctx            = n_ctx,
-            n_ff             = n_ff,
-            n_head           = n_head,
-            n_head_kv        = n_head_kv,
-            f_norm_eps       = f_norm_eps,
-            f_rope_freq_base = f_rope_freq_base,
+            n_ff             = config["intermediate_size"],
+            n_head           = (n_head := config["num_attention_heads"]),
+            n_head_kv        = config.get("num_key_value_heads", n_head),
+            f_norm_eps       = config["rms_norm_eps"],
+            f_rope_freq_base = config.get("rope_theta"),
+            rope_scaling_type = rope_scaling_type,
             f_rope_scale     = f_rope_scale,
+            n_orig_ctx       = n_orig_ctx,
+            rope_finetuned   = rope_finetuned,
         )
 
     # LLaMA v2 70B params.json
@@ -240,17 +246,8 @@ class Params:
     def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))
 
-        n_vocab          = config["vocab_size"] if "vocab_size" in config else -1
-        n_embd           = config["dim"]
-        n_layer          = config["n_layers"]
-        n_ff             = -1
-        n_head           = config["n_heads"]
-        n_head_kv        = config["n_kv_heads"] if "n_kv_heads" in config else n_head
-        f_norm_eps       = config["norm_eps"]
-        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
-
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if f_rope_freq_base == 1000000:
+        if config.get("rope_theta") == 1000000:
             # CodeLlama
             n_ctx = 16384
         elif config["norm_eps"] == 1e-05:
@@ -260,22 +257,16 @@ class Params:
             # LLaMA v1
             n_ctx = 2048
 
-        if n_vocab == -1:
-            n_vocab = model["tok_embeddings.weight"].shape[0]
-
-        if n_ff == -1:
-            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
-
         return Params(
-            n_vocab          = n_vocab,
-            n_embd           = n_embd,
-            n_layer          = n_layer,
+            n_vocab          = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
+            n_embd           = config["dim"],
+            n_layer          = config["n_layers"],
             n_ctx            = n_ctx,
-            n_ff             = n_ff,
-            n_head           = n_head,
-            n_head_kv        = n_head_kv,
-            f_norm_eps       = f_norm_eps,
-            f_rope_freq_base = f_rope_freq_base,
+            n_ff             = model["layers.0.feed_forward.w1.weight"].shape[0],
+            n_head           = (n_head := config["n_heads"]),
+            n_head_kv        = config.get("n_kv_heads", n_head),
+            f_norm_eps       = config["norm_eps"],
+            f_rope_freq_base = config.get("rope_theta"),
        )
 
     @staticmethod
@@ -366,15 +357,18 @@ class SentencePieceVocab:
             added_tokens = {}
 
         vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
 
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids   = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base = vocab_size
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
 
@@ -828,8 +822,16 @@ class OutputFile:
         if params.f_rope_freq_base is not None:
             self.gguf.add_rope_freq_base(params.f_rope_freq_base)
 
-        if params.f_rope_scale is not None:
-            self.gguf.add_rope_scale_linear(params.f_rope_scale)
+        if params.rope_scaling_type:
+            assert params.f_rope_scale is not None
+            self.gguf.add_rope_scaling_type(params.rope_scaling_type)
+            self.gguf.add_rope_scaling_factor(params.f_rope_scale)
+
+        if params.n_orig_ctx is not None:
+            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
+
+        if params.rope_finetuned is not None:
+            self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
 
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
@@ -154,6 +154,10 @@ int main(int argc, char ** argv) {
         }
     }
 
+    LOG_TEE("\n");
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
+    LOG_TEE("\n");
+
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
     LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
 
@@ -181,7 +185,7 @@ int main(int argc, char ** argv) {
 
     const auto t_pp_start = ggml_time_us();
 
-    llama_kv_cache_tokens_rm(ctx, -1, -1);
+    llama_kv_cache_clear(ctx);
 
     if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
         LOG_TEE("%s: llama_decode() failed\n", __func__);
@@ -11,7 +11,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
 
     if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
         return 1 ;
     }
 
@@ -21,6 +21,9 @@ int main(int argc, char ** argv) {
     // total length of the sequences including the prompt
     int n_len = 32;
 
+    // number of layers to offload to the GPU
+    int n_gpu_layers = 0;
+
     if (argc >= 2) {
         params.model = argv[1];
     }
@@ -37,6 +40,10 @@ int main(int argc, char ** argv) {
         n_len = std::atoi(argv[4]);
     }
 
+    if (argc >= 6) {
+        n_gpu_layers = std::atoi(argv[5]);
+    }
+
     if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }
@@ -49,7 +56,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = llama_model_default_params();
 
-    // model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    model_params.n_gpu_layers = n_gpu_layers;
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -1,9 +1,6 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 
@@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 
@@ -642,8 +642,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         const int rope_mode = 0;
 
         return ggml_rope_custom(ctx,
-            t, KQ_pos, n_rot, rope_mode, n_ctx,
-            rope_freq_base, rope_freq_scale);
+            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
+            rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f
+        );
     };
 
     set_name(tokens_input, "tokens_input");
@@ -652,7 +653,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
 
     auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type)) {
+        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
             return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
         } else if (a->type == GGML_TYPE_F32) {
             return ggml_add(ctx, a, b);
@@ -1459,6 +1460,17 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
             }
             params->n_rank_w3 = std::stoi(argv[i]);
             params->custom_n_rank_w3 = true;
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+            params->common.n_gpu_layers = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             train_print_usage(argc, argv, &default_params);
@@ -1545,6 +1557,7 @@ int main(int argc, char ** argv) {
     srand(params.common.seed);
 
     struct llama_model_params llama_mparams = llama_model_default_params();
+    llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
     llama_mparams.vocab_only = false;
 
     printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
34 examples/finetune/finetune.sh (new file)

@@ -0,0 +1,34 @@
+#!/bin/bash
+cd `dirname $0`
+cd ../..
+
+EXE="./finetune"
+
+if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
+if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
+
+# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
+MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
+
+while getopts "dg" opt; do
+  case $opt in
+    d)
+      DEBUGGER="gdb --args"
+      ;;
+    g)
+      EXE="./build/bin/Release/finetune"
+      GPUARG="--gpu-layers 25"
+      ;;
+  esac
+done
+
+$DEBUGGER $EXE \
+        --model-base $MODEL \
+        $GPUARG \
+        --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
+        --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
+        --lora-out lora-ol3b-shakespeare-ITERATION.bin \
+        --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
+        --save-every 10 \
+        --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
+        --use-checkpointing
@@ -3,6 +3,3 @@ add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -2,7 +2,6 @@
 
 #include "console.h"
 #include "llama.h"
-#include "build-info.h"
 #include "grammar-parser.h"
 
 #include <cassert>
@@ -184,8 +183,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
-    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
+    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -19,7 +19,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "build-info.h"
 #include "ggml-cuda.h"
 
 // utils
@@ -641,8 +640,8 @@ struct test {
     }
 };
 
-const std::string test::build_commit = BUILD_COMMIT;
-const int test::build_number = BUILD_NUMBER;
+const std::string test::build_commit = LLAMA_COMMIT;
+const int test::build_number = LLAMA_BUILD_NUMBER;
 const bool test::cuda = !!ggml_cpu_has_cublas();
 const bool test::opencl = !!ggml_cpu_has_clblast();
 const bool test::metal = !!ggml_cpu_has_metal();
@@ -1037,7 +1036,7 @@ int main(int argc, char ** argv) {
 
         test t(inst, lmodel, ctx);
 
-        llama_kv_cache_tokens_rm(ctx, -1, -1);
+        llama_kv_cache_clear(ctx);
 
         // warmup run
         if (t.n_prompt > 0) {
@@ -1048,7 +1047,7 @@ int main(int argc, char ** argv) {
         }
 
         for (int i = 0; i < params.reps; i++) {
-            llama_kv_cache_tokens_rm(ctx, -1, -1);
+            llama_kv_cache_clear(ctx);
 
             uint64_t t_start = get_time_ns();
             if (t.n_prompt > 0) {
@@ -6,15 +6,9 @@ target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
     target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
 
 set(TARGET llava)
 add_executable(${TARGET} llava.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -16,6 +16,8 @@ add_library(common OBJECT
     ${_common_path}/console.cpp
     ${_common_path}/grammar-parser.h
     ${_common_path}/grammar-parser.cpp
+    ${_common_path}/sampling.h
+    ${_common_path}/sampling.cpp
     )
 
 # WARNING: because build-info.h is auto-generated, it will only
@@ -3,6 +3,3 @@ add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -208,6 +208,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
 
 Example usage: `--top-p 0.95`
 
+### Min P Sampling
+
+- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
+
+The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
+
+Example usage: `--min-p 0.05`
+
 ### Tail Free Sampling (TFS)
 
 - `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
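The README paragraph above describes a relative cutoff: a token survives only if its probability is at least min_p times the probability of the most likely token. A self-contained sketch of that rule; the function and variable names here are illustrative, not llama.cpp's internal API:

    #include <algorithm>
    #include <vector>

    // Zero out tokens whose probability falls below min_p times the top probability.
    // With min_p = 0.05 and a best probability of 0.9 the cutoff is 0.045,
    // matching the worked example in the README hunk above.
    std::vector<float> min_p_filter(std::vector<float> probs, float min_p) {
        const float p_max  = *std::max_element(probs.begin(), probs.end());
        const float cutoff = min_p * p_max;
        for (float & p : probs) {
            if (p < cutoff) p = 0.0f; // filtered out; a real sampler would renormalize
        }
        return probs;
    }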
@@ -2,7 +2,6 @@
 
 #include "console.h"
 #include "llama.h"
-#include "build-info.h"
 
 #include <cassert>
 #include <cinttypes>
@@ -153,8 +152,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
-    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
+    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -298,7 +297,7 @@ int main(int argc, char ** argv) {
         }
 
         // remove any "future" tokens that we might have inherited from the previous session
-        llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
+        llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
     }
 
     LOGLN(
@@ -3,6 +3,3 @@ add_executable(${TARGET} parallel.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -1,8 +1,6 @@
 // A basic application simulating a server with multiple clients.
 // The clients submite requests to the server and they are processed in parallel.
 
-#include "build-info.h"
-
 #include "common.h"
 #include "llama.h"
 
@@ -3,6 +3,3 @@ add_executable(${TARGET} perplexity.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 
@@ -210,7 +209,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     const auto t_start = std::chrono::high_resolution_clock::now();
 
     // clear the KV cache
-    llama_kv_cache_tokens_rm(ctx, -1, -1);
+    llama_kv_cache_clear(ctx);
 
     for (int j = 0; j < num_batches; ++j) {
         const int batch_start = start + j * n_batch;
@@ -339,7 +338,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     const auto t_start = std::chrono::high_resolution_clock::now();
 
     // clear the KV cache
-    llama_kv_cache_tokens_rm(ctx, -1, -1);
+    llama_kv_cache_clear(ctx);
 
     for (int j = 0; j < num_batches; ++j) {
         const int batch_start = start + j * n_batch;
@@ -573,7 +572,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         }
 
         // clear the KV cache
-        llama_kv_cache_tokens_rm(ctx, -1, -1);
+        llama_kv_cache_clear(ctx);
 
         auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
         if (logits.empty()) {
@@ -1,6 +1,6 @@
 set(TARGET quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,5 +1,4 @@
 #define LLAMA_API_INTERNAL
-#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
@@ -1,9 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"

@@ -18,7 +17,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
#ifdef GGML_USE_K_QUANTS
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },

@@ -31,7 +29,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
#endif
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

@@ -70,13 +67,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
}

// usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
if (it.name != "COPY") {

@@ -103,6 +101,8 @@ int main(int argc, char ** argv) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
params.pure = true;
} else {
usage(argv[0]);
}
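The new flag maps directly onto the pure field of llama_model_quantize_params, so the same behavior is reachable from the C API. A hedged sketch (file names and ftype are illustrative):

    #include "llama.h"

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    qparams.pure  = true; // no k-quant mixtures: every tensor gets the same type
    llama_model_quantize("ggml-model-f16.gguf", "ggml-model-q4_k_m.gguf", &qparams);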
@@ -3,6 +3,3 @@ add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
@@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"
@@ -11,6 +11,3 @@ if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
@@ -1,6 +1,5 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include "grammar-parser.h"

#include "../llava/clip.h"

@@ -149,6 +148,7 @@ struct task_server {
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
};

struct task_result {

@@ -371,6 +371,7 @@ struct llama_client_slot
std::vector<completion_token_output> generated_token_probs;

bool infill = false;
bool embedding = false;
bool has_next_token = true;
bool truncated = false;
bool stopped_eos = false;

@@ -454,7 +455,7 @@ struct llama_client_slot
}

void release() {
if (state == PROCESSING)
if (state == IDLE || state == PROCESSING)
{
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
command = RELEASE;

@@ -754,6 +755,7 @@ struct llama_server_context
}

slot->params.antiprompt.clear();

const auto &stop = data.find("stop");
if (stop != data.end() && stop->is_array())
{

@@ -856,7 +858,7 @@ struct llama_server_context

void kv_cache_clear() {
// clear the entire KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1);
llama_kv_cache_clear(ctx);
clean_kv_cache = false;
}

@@ -867,7 +869,7 @@ struct llama_server_context

kv_cache_clear();

for (int32_t i = 0; i < batch.n_tokens; ++i)
for (int i = 0; i < (int) system_tokens.size(); ++i)
{
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}

@@ -894,16 +896,8 @@ struct llama_server_context
{
slot.release();
}
wait_all_are_idle();
all_slots_are_idle = true;

// wait until system prompt load
system_need_update = true;
while (system_need_update)
{
std::this_thread::sleep_for(std::chrono::milliseconds(5));
}
// system prompt loaded, continue
}

void process_system_prompt_data(const json &sys_props) {

@@ -915,26 +909,6 @@ struct llama_server_context
{
notify_system_prompt_changed();
}
else
{
system_need_update = true;
}
}

void wait_all_are_idle() {
bool wait = true;
while (wait)
{
wait = false;
for (auto &slot : slots)
{
if (!slot.available())
{
wait = true;
break;
}
}
}
}

static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,

@@ -965,7 +939,6 @@ struct llama_server_context
slot.has_next_token = false;
}
stop_pos = pos;

}
}
@@ -1272,13 +1245,14 @@ struct llama_server_context
queue_results.push_back(res);
}

int request_completion(json data, bool infill)
int request_completion(json data, bool infill, bool embedding)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_server task;
task.id = id_gen++;
task.data = data;
task.infill_mode = infill;
task.embedding_mode = embedding;
task.type = COMPLETION_TASK;
queue_tasks.push_back(task);
return task.id;

@@ -1404,7 +1378,7 @@ struct llama_server_context
{
LOG_TEE("slot unavailable\n");
// send error result
send_error(task.id, "slot unavaliable");
send_error(task.id, "slot unavailable");
return;
}

@@ -1416,6 +1390,7 @@ struct llama_server_context
slot->reset();

slot->infill = task.infill_mode;
slot->embedding = task.embedding_mode;
slot->task_id = task.id;

if (!launch_slot_with_data(slot, task.data))

@@ -1444,7 +1419,7 @@ struct llama_server_context
process_tasks();

// update the system prompt wait until all slots are idle state
if (system_need_update)
if (system_need_update && all_slots_are_idle)
{
LOG_TEE("updating system prompt\n");
update_system_prompt();

@@ -1498,7 +1473,7 @@ struct llama_server_context
for (auto & slot : slots)
{
// release the slot
if (slot.state == PROCESSING && slot.command == RELEASE)
if (slot.command == RELEASE)
{
slot.state = IDLE;
slot.command = NONE;

@@ -1509,7 +1484,7 @@ struct llama_server_context
continue;
}

if (slot.state == IDLE || slot.command == RELEASE)
if (slot.state == IDLE)
{
continue;
}

@@ -1530,6 +1505,17 @@ struct llama_server_context
{
for (auto & slot : slots)
{
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();

// empty prompt passed -> release the slot and send empty response
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
{
slot.release();
slot.print_timings();
send_final_response(slot);
continue;
}

// need process the prompt
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
{

@@ -1712,7 +1698,7 @@ struct llama_server_context
}

// prompt evaluated for embedding
if (params.embedding)
if (slot.embedding)
{
send_embedding(slot);
slot.release();

@@ -1749,8 +1735,8 @@ struct llama_server_context
if (!process_token(result, slot))
{
slot.release();
send_final_response(slot);
slot.print_timings();
send_final_response(slot);
}

slot.i_batch = -1;
@@ -1769,9 +1755,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -h, --help show this help message and exit\n");
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-scaling {none,linear,yarn}\n");
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");

@@ -1893,6 +1886,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.n_ctx = std::stoi(argv[i]);
}
else if (arg == "--rope-scaling")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
std::string value(argv[i]);
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
else { invalid_param = true; break; }
}
else if (arg == "--rope-freq-base")
{
if (++i >= argc)

@@ -1911,6 +1917,38 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.rope_freq_scale = std::stof(argv[i]);
}
else if (arg == "--yarn-ext-factor")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_ext_factor = std::stof(argv[i]);
}
else if (arg == "--yarn-attn-factor")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_attn_factor = std::stof(argv[i]);
}
else if (arg == "--yarn-beta-fast")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_fast = std::stof(argv[i]);
}
else if (arg == "--yarn-beta-slow")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_slow = std::stof(argv[i]);
}
else if (arg == "--memory-f32" || arg == "--memory_f32")
{
params.memory_f16 = false;

@@ -1924,6 +1962,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.n_threads = std::stoi(argv[i]);
}
else if (arg == "--threads-batch" || arg == "-tb")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_threads_batch = std::stoi(argv[i]);
}
else if (arg == "-b" || arg == "--batch-size")
{
if (++i >= argc)

@@ -2216,8 +2263,8 @@ int main(int argc, char **argv)

llama_backend_init(params.numa);

LOG_INFO("build info", {{"build", BUILD_NUMBER},
LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
{"commit", BUILD_COMMIT}});
{"commit", LLAMA_COMMIT}});

LOG_INFO("system info", {
{"n_threads", params.n_threads},

@@ -2281,7 +2328,7 @@ int main(int argc, char **argv)
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
{
json data = json::parse(req.body);
const int task_id = llama.request_completion(data, false);
const int task_id = llama.request_completion(data, false, false);
if (!json_value(data, "stream", false)) {
std::string completion_text;
task_result result = llama.next_result(task_id);

@@ -2336,7 +2383,7 @@ int main(int argc, char **argv)
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
{
json data = json::parse(req.body);
const int task_id = llama.request_completion(data, true);
const int task_id = llama.request_completion(data, true, false);
if (!json_value(data, "stream", false)) {
std::string completion_text;
task_result result = llama.next_result(task_id);

@@ -2440,7 +2487,7 @@ int main(int argc, char **argv)
{
prompt = "";
}
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
task_result result = llama.next_result(task_id);
return res.set_content(result.result_json.dump(), "application/json");
});
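The /embedding route is the one place that passes embedding = true; the flag now rides on the task and its slot instead of the process-wide params.embedding, so a single server can serve completions and embeddings concurrently. A hedged sketch of the call shape introduced here (prompt comes from the request body):

    // infill = false, embedding = true: evaluate the prompt, return only the embedding
    const int task_id = llama.request_completion(
        { {"prompt", prompt}, {"n_predict", 0} },
        /* infill    */ false,
        /* embedding */ true);
    task_result result = llama.next_result(task_id);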
@@ -95,13 +95,8 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(512, 0, 1);

// evaluate the initial prompt
batch.n_tokens = tokens_list.size();
for (size_t i = 0; i < tokens_list.size(); i++) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
for (int32_t i = 0; i < batch.n_tokens; i++) {
batch.token[i] = tokens_list[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}

// llama_decode will output logits only for the last token of the prompt

@@ -148,15 +143,10 @@ int main(int argc, char ** argv) {
fflush(stdout);

// prepare the next batch
batch.n_tokens = 0;
llama_batch_clear(batch);

// push this new token for next evaluation
batch.token [batch.n_tokens] = new_token_id;
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
batch.pos [batch.n_tokens] = n_cur;
batch.seq_id[batch.n_tokens] = 0;
batch.logits[batch.n_tokens] = true;

batch.n_tokens += 1;

n_decode += 1;
}
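The example now drives llama_batch through the common.h helpers instead of writing its fields by hand. A condensed sketch of the pattern, assuming tokens, ctx and n_cur are in scope as in the example:

    // queue the prompt; only the last token needs logits
    llama_batch batch = llama_batch_init(512, 0, 1);
    for (size_t i = 0; i < tokens.size(); i++) {
        llama_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, false);
    }
    batch.logits[batch.n_tokens - 1] = true;
    llama_decode(ctx, batch);

    // between generation steps:
    llama_batch_clear(batch);                          // resets n_tokens to 0
    llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);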
@@ -3,6 +3,3 @@ add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
@@ -1,5 +1,3 @@
#include "build-info.h"

#include "common.h"
#include "llama.h"

@@ -8,6 +6,9 @@
#include <string>
#include <vector>

#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

struct seq_draft {
bool active = false;
bool drafting = false;

@@ -36,9 +37,11 @@ int main(int argc, char ** argv) {
// max number of parallel drafting sequences (i.e. tree branches)
const int n_seq_dft = params.n_parallel;

// TODO: make this configurable
// probability threshold for accepting a token from the draft model
const float p_accept = 0.80f;
const float p_accept = params.p_accept;
const float p_split = 0.10f;

// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
const float p_split = params.p_split;

#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log"));

@@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
params.n_gpu_layers = params.n_gpu_layers_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

{
const int n_vocab_tgt = llama_n_vocab(model_tgt);
const int n_vocab_dft = llama_n_vocab(model_dft);
const int vocab_diff = n_vocab_tgt > n_vocab_dft
? n_vocab_tgt - n_vocab_dft
: n_vocab_dft - n_vocab_tgt;

if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return 1;
}

for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
llama_token_to_piece(ctx_tgt, i).c_str(),
llama_token_to_piece(ctx_dft, i).c_str());
return 1;
}
}
}

// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);

@@ -118,7 +148,7 @@ int main(int argc, char ** argv) {
std::vector<seq_draft> drafts(n_seq_dft);

params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
params.sparams.temp = std::max(0.01f, params.sparams.temp);
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model

for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);

@@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);

llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode (ctx_dft, batch_dft);

++n_past_dft;

@@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
}

//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
llama_decode(ctx_tgt, batch_tgt);
++n_past_tgt;
}
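With p_accept and p_split now read from gpt_params, both thresholds are tunable per run. A hedged, simplified sketch of what they control in the drafting loop (the real code tracks several draft sequences and samplers):

    // cur_p: draft model candidates for the current position, sorted by probability
    if (cur_p[0].p < p_accept) {
        break; // draft model is not confident enough, stop extending this branch
    }
    if (n_seq_cur < n_seq_dft && cur_p[1].p > p_split) {
        // runner-up token is plausible too: fork another draft sequence for it
    }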
@@ -349,9 +349,9 @@ static struct ggml_tensor * llama_build_train_graphs(
// not capturing these, to silcence warnings
const int rope_mode = 0;

return ggml_rope_custom(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx,
rope_freq_base, rope_freq_scale);
return ggml_rope_custom(
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};

set_name(tokens_input, "tokens_input");
12
flake.lock
generated
@@ -5,11 +5,11 @@
"systems": "systems"
},
"locked": {
"lastModified": 1692799911,
"lastModified": 1694529238,
"narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
"type": "github"
},
"original": {

@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1692913444,
"lastModified": 1698318101,
"narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
"narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "18324978d632ffc55ef1d928e81630c620f4f447",
"rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
"type": "github"
},
"original": {
17
flake.nix
@@ -11,8 +11,7 @@
meta.mainProgram = "llama";
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
buildInputs = with pkgs; [ openmpi ];
osSpecific = with pkgs; buildInputs ++
(
osSpecific = with pkgs; buildInputs ++ (
if isAarch64 && isDarwin then
with pkgs.darwin.apple_sdk_11_0.frameworks; [
Accelerate

@@ -51,6 +50,9 @@
};
llama-python =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
llama-python-extra =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
postPatch = ''
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"

@@ -93,12 +95,15 @@
};
packages.rocm = pkgs.stdenv.mkDerivation {
inherit name src meta postPatch nativeBuildInputs postInstall;
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
cmakeFlags = cmakeFlags ++ [
"-DLLAMA_HIPBLAS=1"
"-DCMAKE_C_COMPILER=hipcc"
"-DCMAKE_CXX_COMPILER=hipcc"
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
# in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
# and select the line that matches the current nixpkgs version of rocBLAS.
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
];
};
apps.llama-server = {

@@ -126,5 +131,9 @@
buildInputs = [ llama-python ];
packages = nativeBuildInputs ++ osSpecific;
};
devShells.extra = pkgs.mkShell {
buildInputs = [ llama-python-extra ];
packages = nativeBuildInputs ++ osSpecific;
};
});
}
672
ggml-cuda.cu
File diff suppressed because it is too large
237
ggml-impl.h
Normal file
@@ -0,0 +1,237 @@
#pragma once

#include "ggml.h"

// GGML internal header

#include <assert.h>
#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h> // fabsf

#ifdef __cplusplus
extern "C" {
#endif

// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif

// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#ifndef __SSE3__
#define __SSE3__
#endif
#endif

#undef MIN
#undef MAX

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON) && !defined(_MSC_VER)

// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>

#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)

#define GGML_FP16_TO_FP32(x) ((float) (x))
#define GGML_FP32_TO_FP16(x) (x)

#else

#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif

#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif

#ifdef __F16C__

#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif

#elif defined(__POWER9_VECTOR__)

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}

#else

// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16

static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}

static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;

const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}

base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#endif // __F16C__

#endif // __ARM_NEON

// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)

inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}

#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)

#endif

// TODO: backend v2 PR

#ifdef __cplusplus
}
#endif
46
ggml-metal.m
@@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
if (sourcePath == nil) {
GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
sourcePath = @"ggml-metal.metal";
}
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
if (error) {

@@ -234,12 +238,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
// load kernels
{
NSError * error = nil;
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
/*
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \
*/
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
if (error) { \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \

@@ -994,11 +1001,15 @@ void ggml_metal_graph_compute(
} break;
case GGML_OP_SOFT_MAX:
{
const int nth = MIN(32, ne00);
int nth = 32; // SIMD width

if (ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_soft_max_4];
} else {
do {
nth *= 2;
} while (nth <= ne00 && nth <= 1024);
nth /= 2;
[encoder setComputePipelineState:ctx->pipeline_soft_max];
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];

@@ -1006,8 +1017,9 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];

[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_DIAG_MASK_INF:
{

@@ -1336,7 +1348,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
[encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];

const int64_t nrows = ggml_nrows(src0);

@@ -1391,11 +1403,16 @@ void ggml_metal_graph_compute(
const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

float freq_base;
float freq_scale;
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

switch (src0->type) {
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;

@@ -1425,8 +1442,13 @@ void ggml_metal_graph_compute(
[encoder setBytes:&n_past length:sizeof( int) atIndex:19];
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
[encoder setBytes:&mode length:sizeof( int) atIndex:21];
[encoder setBytes:&freq_base length:sizeof(float) atIndex:22];
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
[encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22];
[encoder setBytes:&freq_base length:sizeof( float) atIndex:23];
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:24];
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:25];
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:27];
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:28];

[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
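The scalar soft_max path now scales its threadgroup up to the row length instead of capping at 32 threads. A small C++ restatement of the selection rule above, with example values (1024 is the Metal threadgroup limit assumed by the kernel):

    int pick_nth(int ne00) {
        int nth = 32;                                 // start at SIMD width
        do { nth *= 2; } while (nth <= ne00 && nth <= 1024);
        return nth / 2;                               // largest size that still fit
    }
    // pick_nth(100)  == 64
    // pick_nth(4096) == 1024 (clamped by the threadgroup limit)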
194
ggml-metal.metal
194
ggml-metal.metal
|
@ -184,36 +184,73 @@ kernel void kernel_soft_max(
|
||||||
constant int64_t & ne00,
|
constant int64_t & ne00,
|
||||||
constant int64_t & ne01,
|
constant int64_t & ne01,
|
||||||
constant int64_t & ne02,
|
constant int64_t & ne02,
|
||||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
threadgroup float * buf [[threadgroup(0)]],
|
||||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
uint tgpig[[threadgroup_position_in_grid]],
|
||||||
uint3 ntg[[threads_per_threadgroup]]) {
|
uint tpitg[[thread_position_in_threadgroup]],
|
||||||
const int64_t i03 = tgpig[2];
|
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||||
const int64_t i02 = tgpig[1];
|
uint tiisg[[thread_index_in_simdgroup]],
|
||||||
const int64_t i01 = tgpig[0];
|
uint ntg[[threads_per_threadgroup]]) {
|
||||||
|
const int64_t i03 = (tgpig) / (ne02*ne01);
|
||||||
|
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
|
||||||
|
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
|
||||||
|
|
||||||
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||||
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||||
|
|
||||||
// parallel max
|
// parallel max
|
||||||
float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
|
float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY;
|
||||||
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
|
|
||||||
|
for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) {
|
||||||
lmax = MAX(lmax, psrc0[i00]);
|
lmax = MAX(lmax, psrc0[i00]);
|
||||||
}
|
}
|
||||||
const float max = simd_max(lmax);
|
|
||||||
|
float max = simd_max(lmax);
|
||||||
|
if (tiisg == 0) {
|
||||||
|
buf[sgitg] = max;
|
||||||
|
}
|
||||||
|
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
|
||||||
|
// broadcast, simd group number is ntg / 32
|
||||||
|
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
|
||||||
|
if (tpitg < i) {
|
||||||
|
buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
|
||||||
|
max = buf[0];
|
||||||
|
|
||||||
// parallel sum
|
// parallel sum
|
||||||
float lsum = 0.0f;
|
float lsum = 0.0f;
|
||||||
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||||
const float exp_psrc0 = exp(psrc0[i00] - max);
|
const float exp_psrc0 = exp(psrc0[i00] - max);
|
||||||
lsum += exp_psrc0;
|
lsum += exp_psrc0;
|
||||||
// Remember the result of exp here. exp is expensive, so we really do not
|
// Remember the result of exp here. exp is expensive, so we really do not
|
||||||
// whish to compute it twice.
|
// wish to compute it twice.
|
||||||
pdst[i00] = exp_psrc0;
|
pdst[i00] = exp_psrc0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const float sum = simd_sum(lsum);
|
float sum = simd_sum(lsum);
|
||||||
|
if (tiisg == 0) {
|
||||||
|
buf[sgitg] = sum;
|
||||||
|
}
|
||||||
|
|
||||||
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
|
||||||
|
// broadcast, simd group number is ntg / 32
|
||||||
|
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
|
||||||
|
if (tpitg < i) {
|
||||||
|
buf[tpitg] += buf[tpitg + i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
|
||||||
|
sum = buf[0];
|
||||||
|
|
||||||
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||||
pdst[i00] /= sum;
|
pdst[i00] /= sum;
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -224,37 +261,73 @@ kernel void kernel_soft_max_4(
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        threadgroup float * buf [[threadgroup(0)]],
+        uint  tgpig[[threadgroup_position_in_grid]],
+        uint  tpitg[[thread_position_in_threadgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint    ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = (tgpig) / (ne02*ne01);
+    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
+    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
 
     device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
     device       float4 * pdst4 = (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
 
     // parallel max
-    float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
-    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
+    float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY;
+
+    for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) {
         lmax4 = fmax(lmax4, psrc4[i00]);
     }
-    float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
 
-    const float max = simd_max(lmax);
+    const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
+    float max = simd_max(lmax);
+    if (tiisg == 0) {
+        buf[sgitg] = max;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // broadcast, simd group number is ntg / 32
+    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    max = buf[0];
 
     // parallel sum
     float4 lsum4 = 0.0f;
-    for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
         const float4 exp_psrc4 = exp(psrc4[i00] - max);
         lsum4 += exp_psrc4;
         pdst4[i00] = exp_psrc4;
     }
-    float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
 
-    const float sum = simd_sum(lsum);
+    const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
+    float sum = simd_sum(lsum);
+    if (tiisg == 0) {
+        buf[sgitg] = sum;
+    }
+
-    for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // broadcast, simd group number is ntg / 32
+    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            buf[tpitg] += buf[tpitg + i];
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sum = buf[0];
+
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
         pdst4[i00] /= sum;
     }
 }
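Both kernels subtract the row maximum before exponentiating. This is the standard numerically stable softmax identity (a well-known fact, not something introduced by this patch):

    \operatorname{softmax}(x)_i \;=\; \frac{e^{x_i}}{\sum_j e^{x_j}} \;=\; \frac{e^{x_i - m}}{\sum_j e^{x_j - m}}, \qquad m = \max_j x_j

Since the shift cancels exactly, the kernels can reduce the max first and reuse it; every exponent is then at most zero, so exp() cannot overflow in float.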
@@ -988,6 +1061,45 @@ kernel void kernel_alibi_f32(
         }
     }
 }
 
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
+    thread float * cos_theta, thread float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
+    }
+    *cos_theta = cos(theta) * mscale;
+    *sin_theta = sin(theta) * mscale;
+}
+
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
+// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
+    return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base));
+}
+
+static void rope_yarn_corr_dims(
+    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
+) {
+    // start and end correction dims
+    dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
+    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
+}
+
 typedef void (rope_t)(
         device const    void * src0,
         device const int32_t * src1,
@@ -1011,8 +1123,13 @@ typedef void (rope_t)(
         constant         int & n_past,
         constant         int & n_dims,
         constant         int & mode,
+        constant         int & n_orig_ctx,
         constant       float & freq_base,
         constant       float & freq_scale,
+        constant       float & ext_factor,
+        constant       float & attn_factor,
+        constant       float & beta_fast,
+        constant       float & beta_slow,
         uint  tiitg[[thread_index_in_threadgroup]],
         uint3  tptg[[threads_per_threadgroup]],
         uint3 tgpig[[threadgroup_position_in_grid]]);
@@ -1041,8 +1158,13 @@ kernel void kernel_rope(
         constant         int & n_past,
         constant         int & n_dims,
         constant         int & mode,
+        constant         int & n_orig_ctx,
         constant       float & freq_base,
         constant       float & freq_scale,
+        constant       float & ext_factor,
+        constant       float & attn_factor,
+        constant       float & beta_fast,
+        constant       float & beta_slow,
         uint  tiitg[[thread_index_in_threadgroup]],
         uint3  tptg[[threads_per_threadgroup]],
         uint3 tgpig[[threadgroup_position_in_grid]]) {
@@ -1052,19 +1174,22 @@ kernel void kernel_rope(
 
     const bool is_neox = mode & 2;
 
+    float corr_dims[2];
+    rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
+
     device const int32_t * pos = src1;
 
     const int64_t p = pos[i2];
 
-    const float theta_0 = freq_scale * (float)p;
+    const float theta_0 = (float)p;
     const float inv_ndims = -1.f/n_dims;
 
     if (!is_neox) {
         for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
 
             const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
-            const float cos_theta = cos(theta);
-            const float sin_theta = sin(theta);
+            float cos_theta, sin_theta;
+            rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
             device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@@ -1079,9 +1204,12 @@ kernel void kernel_rope(
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
             for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
 
-                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
-                const float cos_theta = cos(theta);
-                const float sin_theta = sin(theta);
+                // simplified from `(ib * n_dims + ic) * inv_ndims`
+                const float cur_rot = inv_ndims*ic - ib;
+
+                const float theta = theta_0 * pow(freq_base, cur_rot);
+                float cos_theta, sin_theta;
+                rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
                 const int64_t i0 = ib*n_dims + ic/2;
 
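A sketch of where the "apparently solving" comment in rope_yarn_corr_factor comes from (our reading, not part of the patch): dimension pair $i$ advances by an angle of $p \cdot \mathrm{base}^{-2i/n_{\mathrm{dims}}}$ at position $p$, so over the original context length it completes

    n_{\mathrm{rot}} \;=\; \frac{n_{\mathrm{orig\_ctx}}}{2\pi \, \mathrm{base}^{2i/n_{\mathrm{dims}}}}

full rotations. Solving for the dimension index $i$ at which exactly $n_{\mathrm{rot}}$ rotations occur gives

    i \;=\; \frac{n_{\mathrm{dims}}}{2\ln(\mathrm{base})} \,\ln\!\left(\frac{n_{\mathrm{orig\_ctx}}}{2\pi\, n_{\mathrm{rot}}}\right)

which is rope_yarn_corr_factor as implemented; beta_fast and beta_slow are the two n_rot thresholds that bound the ramp built by rope_yarn_ramp.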
File diff suppressed because it is too large
@@ -1,11 +1,63 @@
 #pragma once
 
-#include "ggml.h"
+#include "ggml-impl.h"
 
+// GGML internal header
+
 #include <stdint.h>
-#include <assert.h>
 #include <stddef.h>
 
+#define QK4_0 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    ggml_fp16_t m;          // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+#define QK5_0 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2];  // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+#define QK5_1 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    ggml_fp16_t m;          // min
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2];  // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+#define QK8_0 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    int8_t qs[QK8_0];       // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
+
+#define QK8_1 32
+typedef struct {
+    float d;                // delta
+    float s;                // d * sum(qs[i])
+    int8_t qs[QK8_1];       // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
+
+//
+// Super-block quantization structures
+//
+
 // Super-block size
 #ifdef GGML_QKK_64
 #define QK_K 64
@@ -15,18 +67,6 @@
 #define K_SCALE_SIZE 12
 #endif
 
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-
-//
-// Super-block quantization structures
-//
-
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
@@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
 
 
 // Quantization
+void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
+void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
+void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
+void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
+void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
+void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
+
 void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
 void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
 void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
@@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
 
+void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
+
 void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
@@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
 
 // Dequantization
+void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
+void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
+void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
+
 void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
 void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
 void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
@@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
 void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
 
 // Dot product
+void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
 void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
-// Quantization with histogram collection
-size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
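The block layouts above encode everything needed to decode a row. As an illustration (not the library's implementation, which lives in the suppressed diff above), q4_0 packs two 4-bit quants per byte and reconstructs each weight as x = d * (q - 8). A hedged sketch, with the fp16 scale passed in pre-converted to keep it self-contained:

    #include <cstdint>

    #define QK4_0 32

    // Sketch only: the real block stores the scale as ggml_fp16_t; here it is
    // passed in already converted to float. Nibble ordering follows ggml's
    // convention at the time (low nibbles first half, high nibbles second half).
    typedef struct {
        uint8_t qs[QK4_0 / 2];  // nibbles / quants
    } q4_0_nibbles;

    static void dequantize_q4_0_sketch(const q4_0_nibbles * b, float d, float * y) {
        for (int j = 0; j < QK4_0/2; ++j) {
            const int q0 = (b->qs[j] & 0x0F) - 8;  // low nibble  -> elements 0..15
            const int q1 = (b->qs[j] >>   4) - 8;  // high nibble -> elements 16..31
            y[j]           = q0 * d;
            y[j + QK4_0/2] = q1 * d;
        }
    }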
42 ggml.h
@@ -219,7 +219,7 @@
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
 #define GGML_MAX_NAME          64
-#define GGML_MAX_OP_PARAMS     32
+#define GGML_MAX_OP_PARAMS     64
 #define GGML_DEFAULT_N_THREADS 4
 
 #if UINTPTR_MAX == 0xFFFFFFFF
@@ -401,15 +401,16 @@ extern "C" {
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
-        GGML_OP_CONV_2D,
+        GGML_OP_CONV_1D_STAGE_0,  // internal
+        GGML_OP_CONV_1D_STAGE_1,  // internal
         GGML_OP_CONV_TRANSPOSE_1D,
+        GGML_OP_CONV_2D,
+        GGML_OP_CONV_2D_STAGE_0,  // internal
+        GGML_OP_CONV_2D_STAGE_1,  // internal
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
 
-        GGML_OP_CONV_1D_STAGE_0,  // internal
-        GGML_OP_CONV_1D_STAGE_1,  // internal
-
         GGML_OP_UPSCALE, // nearest interpolate
 
         GGML_OP_FLASH_ATTN,
@@ -1020,9 +1021,9 @@ extern "C" {
             struct ggml_tensor  * b,
             float                 eps);
 
-    // A: n columns, m rows
-    // B: n columns, p rows  (i.e. we transpose it internally)
-    // result is m columns, p rows
+    // A: k columns, n rows => [ne03, ne02, n, k]
+    // B: k columns, m rows  (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1325,8 +1326,13 @@ extern "C" {
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
+            int                   n_orig_ctx,
             float                 freq_base,
-            float                 freq_scale);
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
 
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
@@ -1336,8 +1342,17 @@ extern "C" {
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
+            int                   n_orig_ctx,
             float                 freq_base,
-            float                 freq_scale);
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
+    // compute correction dims for YaRN RoPE scaling
+    void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // xPos RoPE, in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
@@ -1929,12 +1944,19 @@ extern "C" {
     // quantization
     //
 
+    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
+    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
     //
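With the extended signature, callers supply the YaRN knobs alongside the existing frequency parameters. A hedged sketch of a call, with argument order taken from the header hunk above; `ctx`, `cur` (the tensor to rotate) and `inp_pos` (positions) are assumed to already exist, and every numeric value is a placeholder, not a recommendation:

    // Hypothetical call to the extended RoPE; new arguments are marked.
    struct ggml_tensor * rotated = ggml_rope_custom(
        ctx, cur, inp_pos,
        /*n_dims      =*/ 128,
        /*mode        =*/ 0,
        /*n_ctx       =*/ 4096,
        /*n_orig_ctx  =*/ 4096,    // new: original training context
        /*freq_base   =*/ 10000.0f,
        /*freq_scale  =*/ 1.0f,
        /*ext_factor  =*/ 0.0f,    // new: 0 disables the YaRN ramp (see rope_yarn above)
        /*attn_factor =*/ 1.0f,    // new: magnitude scaling
        /*beta_fast   =*/ 32.0f,   // new: low correction dim
        /*beta_slow   =*/ 1.0f);   // new: high correction dim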
@@ -7,7 +7,7 @@ import shutil
 import struct
 import sys
 import tempfile
-from enum import IntEnum, auto
+from enum import Enum, IntEnum, auto
 from io import BufferedWriter
 from pathlib import Path
 from typing import IO, Any, BinaryIO, Callable, Sequence
@@ -55,7 +55,10 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
 # RoPE
 KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
 KEY_ROPE_FREQ_BASE       = "{arch}.rope.freq_base"
-KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"
+KEY_ROPE_SCALING_TYPE         = "{arch}.rope.scaling.type"
+KEY_ROPE_SCALING_FACTOR       = "{arch}.rope.scaling.factor"
+KEY_ROPE_SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
+KEY_ROPE_SCALING_FINETUNED    = "{arch}.rope.scaling.finetuned"
 
 # tokenization
 KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
@@ -408,6 +411,7 @@ class TensorNameMap:
             "layers.{bid}.attention_norm",                          # llama-pth
             "encoder.layer.{bid}.attention.output.LayerNorm",       # bert
             "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
+            "model.layers.{bid}.ln1",                               # yi
         ),
 
         # Attention norm 2
@@ -479,6 +483,7 @@ class TensorNameMap:
             "layers.{bid}.ffn_norm",                                         # llama-pth
             "encoder.layer.{bid}.output.LayerNorm",                          # bert
             "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
+            "model.layers.{bid}.ln2",                                        # yi
         ),
 
         # Feed-forward up
@@ -595,6 +600,11 @@ class TokenType(IntEnum):
     UNUSED    = 5
     BYTE      = 6
 
+
+class RopeScalingType(Enum):
+    NONE   = 'none'
+    LINEAR = 'linear'
+    YARN   = 'yarn'
+
 #
 # implementation
 #
@@ -966,8 +976,17 @@ class GGUFWriter:
     def add_rope_freq_base(self, value: float):
         self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
 
-    def add_rope_scale_linear(self, value: float):
-        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
+    def add_rope_scaling_type(self, value: RopeScalingType):
+        self.add_string(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), value.value)
+
+    def add_rope_scaling_factor(self, value: float):
+        self.add_float32(KEY_ROPE_SCALING_FACTOR.format(arch=self.arch), value)
+
+    def add_rope_scaling_orig_ctx_len(self, value: int):
+        self.add_uint32(KEY_ROPE_SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
+
+    def add_rope_scaling_finetuned(self, value: bool):
+        self.add_bool(KEY_ROPE_SCALING_FINETUNED.format(arch=self.arch), value)
 
     def add_tokenizer_model(self, model: str):
         self.add_string(KEY_TOKENIZER_MODEL, model)
36 llama.h
@@ -106,6 +106,14 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE        = 0,
+        LLAMA_ROPE_SCALING_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_YARN        = 2,
+        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -172,13 +180,19 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base;  // RoPE base frequency, 0 = from model
         float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+        float yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float yarn_attn_factor; // YaRN magnitude scaling factor
+        float yarn_beta_fast;   // YaRN low correction dim
+        float yarn_beta_slow;   // YaRN high correction dim
+        uint32_t yarn_orig_ctx; // YaRN original context size
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
@@ -191,6 +205,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
     } llama_model_quantize_params;
 
     // grammar types
@@ -333,15 +348,12 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Remove all tokens data of cells in [c0, c1)
-    // c0 < 0 : [0, c1]
-    // c1 < 0 : [c0, inf)
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-                         int32_t   c0,
-                         int32_t   c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // seq_id < 0 : match any sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
@@ -600,6 +612,13 @@ extern "C" {
             float   p,
             size_t  min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   p,
+                          size_t   min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
@@ -658,6 +677,7 @@ extern "C" {
             float * mu);
 
     /// @details Selects the token with the highest probability.
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
            struct llama_context * ctx,
          llama_token_data_array * candidates);
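Taken together, the new public surface looks roughly like this from a caller's side. A hedged sketch only: llama_context_default_params() and the sampling loop around `candidates` are assumed from the existing API, and the numeric values are placeholders, not recommendations:

    // Hypothetical usage of the new context-params fields and sampler.
    struct llama_context_params cparams = llama_context_default_params();
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; // int8_t, from enum llama_rope_scaling_type
    cparams.yarn_orig_ctx     = 4096;   // original training context (placeholder)
    cparams.yarn_attn_factor  = 1.0f;   // YaRN magnitude scaling
    cparams.yarn_beta_fast    = 32.0f;  // low correction dim
    cparams.yarn_beta_slow    = 1.0f;   // high correction dim

    // New sampler: drops tokens whose probability is below p times that of the
    // most likely token (see PR #3841); `ctx` and `candidates` assumed set up.
    llama_sample_min_p(ctx, &candidates, 0.05f, 1);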
Binary file not shown.
@@ -1,5 +1,5 @@
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
-set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 set(BUILD_NUMBER 0)
 set(BUILD_COMMIT "unknown")
 set(BUILD_COMPILER "unknown")
@@ -24,16 +24,22 @@ if(Git_FOUND)
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         OUTPUT_VARIABLE HEAD
         OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
     )
+    if (RES EQUAL 0)
+        set(BUILD_COMMIT ${HEAD})
+    endif()
     execute_process(
         COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         OUTPUT_VARIABLE COUNT
         OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
     )
-    set(BUILD_COMMIT ${HEAD})
+    if (RES EQUAL 0)
         set(BUILD_NUMBER ${COUNT})
     endif()
+endif()
 
 if(MSVC)
     set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
@@ -53,22 +59,22 @@ else()
     set(BUILD_TARGET ${OUT})
 endif()
 
-# Only write the header if it's changed to prevent unnecessary recompilation
-if(EXISTS ${HEADER_FILE})
-    file(READ ${HEADER_FILE} CONTENTS)
-    string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
+# Only write the build info if it changed
+if(EXISTS ${OUTPUT_FILE})
+    file(READ ${OUTPUT_FILE} CONTENTS)
+    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
+    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
+    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_TARGET ${CMAKE_MATCH_1})
     if (
        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
        NOT OLD_TARGET   STREQUAL BUILD_TARGET
    )
-        configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
+        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
    endif()
 else()
-    configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
+    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
 endif()

@@ -1,9 +0,0 @@
-#ifndef BUILD_INFO_H
-#define BUILD_INFO_H
-
-#define BUILD_NUMBER @BUILD_NUMBER@
-#define BUILD_COMMIT "@BUILD_COMMIT@"
-#define BUILD_COMPILER "@BUILD_COMPILER@"
-#define BUILD_TARGET "@BUILD_TARGET@"
-
-#endif // BUILD_INFO_H

@@ -24,12 +24,7 @@ if out=$($CC -dumpmachine); then
     build_target=$out
 fi
 
-echo "#ifndef BUILD_INFO_H"
-echo "#define BUILD_INFO_H"
-echo
-echo "#define BUILD_NUMBER $build_number"
-echo "#define BUILD_COMMIT \"$build_commit\""
-echo "#define BUILD_COMPILER \"$build_compiler\""
-echo "#define BUILD_TARGET \"$build_target\""
-echo
-echo "#endif // BUILD_INFO_H"
+echo "int LLAMA_BUILD_NUMBER = ${build_number};"
+echo "char const *LLAMA_COMMIT = \"${build_commit}\";"
+echo "char const *LLAMA_COMPILER = \"${build_compiler}\";"
+echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";"
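With the generated header gone, the build info is now emitted as a C++ translation unit and consumers link against it. A sketch of how a program might consume the globals the script above emits (in-tree code would get these declarations from a shared header rather than redeclaring them locally):

    #include <cstdio>

    // Declarations matching the globals emitted by the updated script.
    extern int LLAMA_BUILD_NUMBER;
    extern char const * LLAMA_COMMIT;
    extern char const * LLAMA_COMPILER;
    extern char const * LLAMA_BUILD_TARGET;

    int main() {
        std::printf("build %d (%s), compiler %s, target %s\n",
                    LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
        return 0;
    }

Because the values live in a .cpp file instead of a header, changing the commit hash no longer forces a rebuild of every file that used to include build-info.h.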
391 scripts/server-llm.sh Normal file
@@ -0,0 +1,391 @@
+#!/bin/bash
+#
+# Helper script for deploying llama.cpp server with a single Bash command
+#
+# - Works on Linux and macOS
+# - Supports: CPU, CUDA, Metal, OpenCL
+# - Can run all GGUF models from HuggingFace
+# - Can serve requests in parallel
+# - Always builds latest llama.cpp from GitHub
+#
+# Limitations
+#
+# - Chat templates are poorly supported (base models recommended)
+# - Might be unstable!
+#
+# Usage:
+#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
+#
+#   --port:       port number, default is 8888
+#   --repo:       path to a repo containing GGUF model files
+#   --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input
+#   --backend:    cpu, cuda, metal, opencl, depends on the OS
+#   --gpu-id:     gpu id, default is 0
+#   --n-parallel: number of parallel requests, default is 8
+#   --n-kv:       KV cache size, default is 4096
+#   --verbose:    verbose output
+#
+# Example:
+#
+#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
+#
+
+set -e
+
+# required utils: curl, git, make
+if ! command -v curl &> /dev/null; then
+    printf "[-] curl not found\n"
+    exit 1
+fi
+if ! command -v git &> /dev/null; then
+    printf "[-] git not found\n"
+    exit 1
+fi
+if ! command -v make &> /dev/null; then
+    printf "[-] make not found\n"
+    exit 1
+fi
+
+# parse arguments
+port=8888
+repo=""
+wtype=""
+backend="cpu"
+
+# if macOS, use metal backend by default
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    backend="metal"
+elif command -v nvcc &> /dev/null; then
+    backend="cuda"
+fi
+
+gpu_id=0
+n_parallel=8
+n_kv=4096
+verbose=0
+
+function print_usage {
+    printf "Usage:\n"
+    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
+    printf "  --port:       port number, default is 8888\n"
+    printf "  --repo:       path to a repo containing GGUF model files\n"
+    printf "  --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
+    printf "  --backend:    cpu, cuda, metal, opencl, depends on the OS\n"
+    printf "  --gpu-id:     gpu id, default is 0\n"
+    printf "  --n-parallel: number of parallel requests, default is 8\n"
+    printf "  --n-kv:       KV cache size, default is 4096\n"
+    printf "  --verbose:    verbose output\n\n"
+    printf "Example:\n\n"
+    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
+}
+
+while [[ $# -gt 0 ]]; do
+    key="$1"
+    case $key in
+        --port)
+            port="$2"
+            shift
+            shift
+            ;;
+        --repo)
+            repo="$2"
+            shift
+            shift
+            ;;
+        --wtype)
+            wtype="$2"
+            shift
+            shift
+            ;;
+        --backend)
+            backend="$2"
+            shift
+            shift
+            ;;
+        --gpu-id)
+            gpu_id="$2"
+            shift
+            shift
+            ;;
+        --n-parallel)
+            n_parallel="$2"
+            shift
+            shift
+            ;;
+        --n-kv)
+            n_kv="$2"
+            shift
+            shift
+            ;;
+        --verbose)
+            verbose=1
+            shift
+            ;;
+        --help)
+            print_usage
+            exit 0
+            ;;
+        *)
+            echo "Unknown argument: $key"
+            print_usage
+            exit 1
+            ;;
+    esac
+done
+
+# available weights types
+wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
+
+wfiles=()
+for wt in "${wtypes[@]}"; do
+    wfiles+=("")
+done
+
+# sample repos
+repos=(
+    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
+    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
+    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
+    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
+    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
+    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
+    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
+)
+
+printf "\n"
+printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
+printf "    Based on the options that follow, the script might download a model file\n"
+printf "    from the internet, which can be a few GBs in size. The script will also\n"
+printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
+printf "\n"
+printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
+printf "    model using llama.cpp for demonstration purposes.\n"
+printf "\n"
+printf "    Please note:\n"
+printf "\n"
+printf "    - All new data will be stored in the current folder\n"
+printf "    - The server will be listening on all network interfaces\n"
+printf "    - The server will run with default settings which are not always optimal\n"
+printf "    - Do not judge the quality of a model based on the results from this script\n"
+printf "    - Do not use this script to benchmark llama.cpp\n"
+printf "    - Do not use this script in production\n"
+printf "    - This script is only for demonstration purposes\n"
+printf "\n"
+printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
+printf "\n"
+printf "    Press Enter to continue ...\n\n"
+
+read
+
+if [[ -z "$repo" ]]; then
+    printf "[+] No repo provided from the command line\n"
+    printf "    Please select a number from the list below or enter an URL:\n\n"
+
+    is=0
+    for r in "${repos[@]}"; do
+        printf "    %2d) %s\n" $is "$r"
+        is=$((is+1))
+    done
+
+    # ask for repo until index of sample repo is provided or an URL
+    while [[ -z "$repo" ]]; do
+        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
+        read -p "[+] Select repo: " repo
+
+        # check if the input is a number
+        if [[ "$repo" =~ ^[0-9]+$ ]]; then
+            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
+                repo="${repos[$repo]}"
+            else
+                printf "[-] Invalid repo index: %s\n" "$repo"
+                repo=""
+            fi
+        elif [[ "$repo" =~ ^https?:// ]]; then
+            repo="$repo"
+        else
+            printf "[-] Invalid repo URL: %s\n" "$repo"
+            repo=""
+        fi
+    done
+fi
+
+# remove suffix
+repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
+
+printf "[+] Checking for GGUF model files in %s\n" "$repo"
+
+# find GGUF files in the source
+# TODO: better logic
+model_tree="${repo%/}/tree/main"
+model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
+
+# list all files in the provided git repo
+printf "[+] Model files:\n\n"
+for file in $model_files; do
+    # determine iw by grepping the filename with wtypes
+    iw=-1
+    is=0
+    for wt in "${wtypes[@]}"; do
+        # uppercase
+        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
+        if [[ "$ufile" =~ "$wt" ]]; then
+            iw=$is
+            break
+        fi
+        is=$((is+1))
+    done
+
+    if [[ $iw -eq -1 ]]; then
+        continue
+    fi
+
+    wfiles[$iw]="$file"
+
+    have=" "
+    if [[ -f "$file" ]]; then
+        have="*"
+    fi
+
+    printf "    %2d) %s %s\n" $iw "$have" "$file"
+done
+
+# ask for weights type until provided and available
+while [[ -z "$wtype" ]]; do
+    printf "\n"
+    read -p "[+] Select weight type: " wtype
+    wfile="${wfiles[$wtype]}"
+
+    if [[ -z "$wfile" ]]; then
+        printf "[-] Invalid weight type: %s\n" "$wtype"
+        wtype=""
+    fi
+done
+
+printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
+
+url="${repo%/}/resolve/main/$wfile"
+
+# check file if the model has been downloaded before
+chk="$wfile.chk"
+
+# check if we should download the file
+# - if $wfile does not exist
+# - if $wfile exists but $chk does not exist
+# - if $wfile exists and $chk exists but $wfile is newer than $chk
+# TODO: better logic using git lfs info
+
+do_download=0
+
+if [[ ! -f "$wfile" ]]; then
+    do_download=1
+elif [[ ! -f "$chk" ]]; then
+    do_download=1
+elif [[ "$wfile" -nt "$chk" ]]; then
+    do_download=1
+fi
+
+if [[ $do_download -eq 1 ]]; then
+    printf "[+] Downloading weights from %s\n" "$url"
+
+    # download the weights file
+    curl -o "$wfile" -# -L "$url"
+
+    # create a check file if successful
+    if [[ $? -eq 0 ]]; then
+        printf "[+] Creating check file %s \n" "$chk"
+        touch "$chk"
+    fi
+else
+    printf "[+] Using cached weights %s\n" "$wfile"
+fi
+
+# get latest llama.cpp and build
+
+printf "[+] Downloading latest llama.cpp\n"
+
+llama_cpp_dir="__llama_cpp_port_${port}__"
+
+if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
+    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
+    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
+    printf "[-] Please remove it and try again\n"
+    exit 1
+elif [[ -d "$llama_cpp_dir" ]]; then
+    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
+    printf "[+] Using cached llama.cpp\n"
+
+    cd "$llama_cpp_dir"
+    git reset --hard
+    git fetch
+    git checkout origin/master
+
+    cd ..
+else
+    printf "[+] Cloning llama.cpp\n"
+
+    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
+fi
+
+# mark that that the directory is made by this script
+touch "$llama_cpp_dir/__ggml_script__"
+
+if [[ $verbose -eq 1 ]]; then
+    set -x
+fi
+
+# build
+cd "$llama_cpp_dir"
+
+make clean
+
+log="--silent"
+if [[ $verbose -eq 1 ]]; then
+    log=""
+fi
+
+if [[ "$backend" == "cuda" ]]; then
+    printf "[+] Building with CUDA backend\n"
+    LLAMA_CUBLAS=1 make -j server $log
+elif [[ "$backend" == "cpu" ]]; then
+    printf "[+] Building with CPU backend\n"
+    make -j server $log
+elif [[ "$backend" == "metal" ]]; then
+    printf "[+] Building with Metal backend\n"
+    make -j server $log
+elif [[ "$backend" == "opencl" ]]; then
+    printf "[+] Building with OpenCL backend\n"
+    LLAMA_CLBLAST=1 make -j server $log
+else
+    printf "[-] Unknown backend: %s\n" "$backend"
+    exit 1
+fi
+
+# run the server
+
+printf "[+] Running server\n"
+
+args=""
+if [[ "$backend" == "cuda" ]]; then
+    export CUDA_VISIBLE_DEVICES=$gpu_id
+    args="-ngl 999"
+elif [[ "$backend" == "cpu" ]]; then
+    args="-ngl 0"
+elif [[ "$backend" == "metal" ]]; then
+    args="-ngl 999"
+elif [[ "$backend" == "opencl" ]]; then
+    args="-ngl 999"
+else
+    printf "[-] Unknown backend: %s\n" "$backend"
+    exit 1
+fi
+
+if [[ $verbose -eq 1 ]]; then
+    args="$args --verbose"
+fi
+
+./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
+
+exit 0
@@ -4,7 +4,7 @@
 
 #undef NDEBUG
 #include <cassert>
-#if !defined(__riscv) && !defined(__s390__)
+#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
 #include <immintrin.h>
 #endif
 #include <cmath>
@@ -129,6 +129,13 @@ int main(int argc, char * argv[]) {
         ggml_type type = (ggml_type) i;
         ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
 
+        // deprecated - skip
+        if (qfns.blck_size == 0) {
+            continue;
+        }
+
+        printf("Testing %s\n", ggml_type_name((ggml_type) i));
+
         if (qfns.from_float && qfns.to_float) {
             const float total_error = total_quantization_error(qfns, test_size, test_data.data());
             const float max_quantization_error =
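For reference, a round-trip error metric of the kind total_quantization_error computes can be sketched as follows; this is an assumption about its shape (quantize, dequantize, compare), and the real helper in the test file may normalize differently:

    #include <cmath>
    #include <cstddef>

    // Sketch: RMSE between the original row and its quantize/dequantize
    // round trip. Illustrative only; not the test's actual implementation.
    static float round_trip_rmse(const float * x, const float * x_roundtrip, size_t n) {
        double acc = 0.0;
        for (size_t i = 0; i < n; ++i) {
            const double d = (double) x[i] - (double) x_roundtrip[i];
            acc += d * d;
        }
        return (float) std::sqrt(acc / (double) n);
    }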