Merge branch 'master' into master-androidClblast

# Conflicts:
#   README.md

Commit 46cb52bfa4: 54 changed files with 12254 additions and 2765 deletions
.flake8 (new file, +2)
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 125
.github/workflows/build.yml (vendored, 4 changed lines)
@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']

 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
.gitignore (vendored, +5)
@@ -22,6 +22,7 @@ build-metal/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
+out/

 models/*
 *.bin
@@ -32,14 +33,18 @@ models/*
 /result
 /perplexity
 /embedding
+/train-text-from-scratch
+/simple
 /benchmark-matmult
 /vdot
+/server
 /Pipfile
 /libllama.so

 build-info.h
 arm_neon.h
 compile_commands.json
+CMakeSettings.json

 __pycache__
.pre-commit-config.yaml (new file, +15)
@@ -0,0 +1,15 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+exclude: prompts/.*.txt
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: check-yaml
+    -   id: check-added-large-files
+-   repo: https://github.com/PyCQA/flake8
+    rev: 6.0.0
+    hooks:
+    -   id: flake8
CMakeLists.txt
@@ -70,6 +70,8 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
+set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -158,17 +160,64 @@ if (LLAMA_BLAS)
     if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
         set(BLA_SIZEOF_INTEGER 8)
     endif()

     set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)

     if (BLAS_FOUND)
         message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
+
+        if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
+            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
+            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
+            find_package(PkgConfig REQUIRED)
+            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
+                pkg_check_modules(DepBLAS REQUIRED blas)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
+                pkg_check_modules(DepBLAS REQUIRED openblas)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
+                pkg_check_modules(DepBLAS REQUIRED blis)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
+                pkg_check_modules(DepBLAS REQUIRED blas-atlas)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
+                pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
+                # all Intel* libraries share the same include path
+                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
+                # this doesn't provide pkg-config
+                # suggest to assign BLAS_INCLUDE_DIRS on your own
+                if ("${NVHPC_VERSION}" STREQUAL "")
+                    message(WARNING "Better to set NVHPC_VERSION")
+                else()
+                    set(DepBLAS_FOUND ON)
+                    set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
+                endif()
+            endif()
+            if (DepBLAS_FOUND)
+                set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
+            else()
+                message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
+                " detected by pkgconfig, trying to find cblas.h from possible paths...")
+                find_path(BLAS_INCLUDE_DIRS
+                    NAMES cblas.h
+                    HINTS
+                        /usr/include
+                        /usr/local/include
+                        /usr/include/openblas
+                        /opt/homebrew/opt/openblas/include
+                        /usr/local/opt/openblas/include
+                        /usr/include/x86_64-linux-gnu/openblas/include
+                )
+            endif()
+        endif()
+
+        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
         add_compile_options(${BLAS_LINKER_FLAGS})
         add_compile_definitions(GGML_USE_OPENBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
-
-        message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
-        include_directories(${BLAS_INCLUDE_DIRS})
     else()
         message(WARNING "BLAS not found, please refer to "
         "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@@ -190,6 +239,10 @@ if (LLAMA_CUBLAS)
     add_compile_definitions(GGML_USE_CUBLAS)
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+    if (LLAMA_CUDA_DMMV_F16)
+        add_compile_definitions(GGML_CUDA_DMMV_F16)
+    endif()
+    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

     if (LLAMA_STATIC)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -197,6 +250,15 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        if (LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
 else()
     message(WARNING "cuBLAS not found")
 endif()
@@ -408,12 +470,15 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_EXTRA}
             )

-target_include_directories(ggml PUBLIC .)
+target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})

+add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
 if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
+    target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 endif()

 add_library(llama
@@ -437,13 +502,6 @@ if (BUILD_SHARED_LIBS)
     endif()
 endif()

-if (GGML_SOURCES_CUDA)
-    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
-    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
-endif()
-
 #
 # programs, examples and tests
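For reference, the vendor names matched in the new pkg-config probe above (OpenBLAS, FLAME, ATLAS, FlexiBLAS, Intel, NVHPC) are the values accepted by `LLAMA_BLAS_VENDOR`. A minimal configure sketch, assuming OpenBLAS and its pkg-config file are installed:

```sh
# Hypothetical CPU-only build against OpenBLAS; the pkg-config fallback
# added above fills in BLAS_INCLUDE_DIRS when FindBLAS.cmake does not.
mkdir -p build && cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build . --config Release
```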
Makefile (27 changed lines)
@@ -1,8 +1,10 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple

 ifdef LLAMA_BUILD_SERVER
     BUILD_TARGETS += server
+    LLAMA_SERVER_VERBOSE ?= 1
+    server: private CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif

 default: $(BUILD_TARGETS)
@@ -127,6 +129,7 @@ endif

 ifndef LLAMA_NO_K_QUANTS
     CFLAGS   += -DGGML_USE_K_QUANTS
+    CXXFLAGS += -DGGML_USE_K_QUANTS
     OBJS     += k_quants.o
 endif

@@ -141,11 +144,7 @@ endif # LLAMA_NO_ACCELERATE

 ifdef LLAMA_OPENBLAS
     CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
-    ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
-        LDFLAGS += -lopenblas -lcblas
-    else
     LDFLAGS += -lopenblas
-    endif
 endif # LLAMA_OPENBLAS

 ifdef LLAMA_BLIS
@@ -170,6 +169,14 @@ ifdef LLAMA_CUDA_DMMV_Y
 else
     NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
+ifdef LLAMA_CUDA_DMMV_F16
+    NVCCFLAGS += -DGGML_CUDA_DMMV_F16
+endif # LLAMA_CUDA_DMMV_F16
+ifdef LLAMA_CUDA_KQUANTS_ITER
+    NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+else
+    NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
     $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
@@ -248,7 +255,7 @@ $(info )
 ggml.o: ggml.c ggml.h ggml-cuda.h
     $(CC) $(CFLAGS) -c $< -o $@

-llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
     $(CXX) $(CXXFLAGS) -c $< -o $@

 common.o: examples/common.cpp examples/common.h
@@ -258,7 +265,7 @@ libllama.so: llama.o ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-    rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
+    rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h

 #
 # Examples
@@ -270,6 +277,9 @@ main: examples/main/main.cpp build-info.h ggml.
     @echo '==== Run ./main -h for help. ===='
     @echo

+simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -288,6 +298,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
     $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
     @sh scripts/build-info.sh > $@.tmp
     @if ! cmp -s $@.tmp $@; then \
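The new `LLAMA_CUDA_DMMV_F16` and `LLAMA_CUDA_KQUANTS_ITER` switches above mirror the CMake options and are passed on the make command line. A sketch, assuming the CUDA toolkit is installed so that `LLAMA_CUBLAS=1` builds `ggml-cuda.o` (the flag values are illustrative only):

```sh
make clean
make LLAMA_CUBLAS=1 LLAMA_CUDA_DMMV_F16=1 LLAMA_CUDA_KQUANTS_ITER=1 -j
```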
Package.swift
@@ -11,6 +11,7 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
+            exclude: ["ggml-metal.metal"],
             sources: ["ggml.c", "llama.cpp"],
             publicHeadersPath: "spm-headers",
             cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
README.md (45 changed lines)
@@ -5,16 +5,15 @@
 [](https://github.com/ggerganov/llama.cpp/actions)
 [](https://opensource.org/licenses/MIT)

+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
+
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

-- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
-- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
-- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
-- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
-- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652
-- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
+- New roadmap: https://github.com/users/ggerganov/projects/7
+- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
+- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1

 <details>
   <summary>Table of Contents</summary>
@@ -33,6 +32,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
     <li><a href="#quantization">Quantization</a></li>
     <li><a href="#interactive-mode">Interactive mode</a></li>
     <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
+    <li><a href="#using-openllama">Using OpenLLaMA</a></li>
     <li><a href="#using-gpt4all">Using GPT4All</a></li>
     <li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
     <li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
@@ -336,9 +336,15 @@ Building the program with BLAS support may lead to some performance improvements
   cmake .. -DLLAMA_CUBLAS=ON
   cmake --build . --config Release
   ```
-Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.

-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
+
+| Option                  | Legal values           | Default | Description |
+|-------------------------|------------------------|---------|-------------|
+| LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+| LLAMA_CUDA_DMMV_Y       | Positive integer       | 1       | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+| LLAMA_CUDA_DMMV_F16     | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
+| LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

 - #### CLBlast
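The options in the table above are ordinary CMake cache variables, so they can be set at configure time. A minimal sketch (the values shown are illustrative, not tuned recommendations):

```sh
mkdir -p build && cd build
cmake .. -DLLAMA_CUBLAS=ON \
         -DLLAMA_CUDA_DMMV_X=64 \
         -DLLAMA_CUDA_DMMV_Y=2 \
         -DLLAMA_CUDA_DMMV_F16=ON \
         -DLLAMA_CUDA_KQUANTS_ITER=2
cmake --build . --config Release
```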
@@ -372,7 +378,7 @@ Building the program with BLAS support may lead to some performance improvements
   ```sh
   git clone https://github.com/CNugteren/CLBlast.git
   mkdir CLBlast/build
-  cd CLBLast/build
+  cd CLBlast/build
   cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
   cmake --build . --config Release
   cmake --install . --prefix /some/path
@@ -541,6 +547,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 >
 ```

+### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)
+
+OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
+
+- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
+- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>`
+
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
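A worked example of the OpenLLaMA conversion step shown above (the local directory name is hypothetical; the quantization step afterwards is optional):

```sh
# Assumes the Hugging Face download was placed in ./models/open_llama_7b
python convert.py ./models/open_llama_7b
# convert.py writes ./models/open_llama_7b/ggml-model-f16.bin by default;
# optionally quantize it to 4 bits:
./quantize ./models/open_llama_7b/ggml-model-f16.bin ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
```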
@@ -618,7 +631,12 @@ And after 4.45 hours, you will have the final perplexity.

 #### Building the Project using Android NDK
 You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
-First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
+
+First, install the essential packages for termux:
+```
+pkg install clang wget git cmake
+```
+Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
 ```
 $ mkdir build-android
 $ cd build-android
@@ -665,12 +683,15 @@ Upon completion of the aforementioned steps, you will have successfully compiled
 ```
 GGML_OPENCL_PLATFORM=0
 GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
-./main (...)
+export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
 ```

+(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH" instead. Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
+
 For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

+Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
+
 ### Docker

 #### Prerequisites
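A sketch of the .sh wrapper suggested in the section above (the binary and model locations are assumptions; adjust them to your build):

```sh
#!/bin/sh
# Hypothetical helper for re-running the CLBlast build under termux.
export GGML_OPENCL_PLATFORM=0
export GGML_OPENCL_DEVICE=0
# Some devices (e.g. the Zenfone 8) need /system/vendor/lib64 here instead.
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
./main -m ./models/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 128
```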
build.zig (79 changed lines)
@@ -1,61 +1,58 @@
 const std = @import("std");

+// Zig Version: 0.11.0-dev.3379+629f0d23b
 pub fn build(b: *std.build.Builder) void {
     const target = b.standardTargetOptions(.{});
-    const optimize = b.standardReleaseOptions();
-    const want_lto = b.option(bool, "lto", "Want -fLTO");
-
-    const lib = b.addStaticLibrary("llama", null);
-    lib.want_lto = want_lto;
-    lib.setTarget(target);
-    lib.setBuildMode(optimize);
+    const optimize = b.standardOptimizeOption(.{});
+    const lib = b.addStaticLibrary(.{
+        .name = "llama",
+        .target = target,
+        .optimize = optimize,
+    });
+    lib.linkLibC();
     lib.linkLibCpp();
     lib.addIncludePath(".");
-    lib.addIncludePath("examples");
+    lib.addIncludePath("./examples");
     lib.addCSourceFiles(&.{
         "ggml.c",
     }, &.{"-std=c11"});
     lib.addCSourceFiles(&.{
         "llama.cpp",
     }, &.{"-std=c++11"});
-    lib.install();
-
-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
-
-    const exe = build_example("main", build_args);
-    _ = build_example("quantize", build_args);
-    _ = build_example("perplexity", build_args);
-    _ = build_example("embedding", build_args);
-
-    // create "zig build run" command for ./main
-
-    const run_cmd = exe.run();
-    run_cmd.step.dependOn(b.getInstallStep());
-    if (b.args) |args| {
-        run_cmd.addArgs(args);
-    }
-
-    const run_step = b.step("run", "Run the app");
-    run_step.dependOn(&run_cmd.step);
-}
-
-fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
-    const b = args.b;
-    const lib = args.lib;
-    const want_lto = args.want_lto;
-
-    const exe = b.addExecutable(name, null);
-    exe.want_lto = want_lto;
-    lib.setTarget(args.target);
-    lib.setBuildMode(args.optimize);
-    exe.addIncludePath(".");
-    exe.addIncludePath("examples");
-    exe.addCSourceFiles(&.{
-        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
-        "examples/common.cpp",
-    }, &.{"-std=c++11"});
-    exe.linkLibrary(lib);
-    exe.install();
-
-    return exe;
+    b.installArtifact(lib);
+
+    const examples = .{
+        "main",
+        "baby-llama",
+        "embedding",
+        // "metal",
+        "perplexity",
+        "quantize",
+        "quantize-stats",
+        "save-load-state",
+        // "server",
+        "simple",
+        "train-text-from-scratch",
+    };
+
+    inline for (examples) |example_name| {
+        const exe = b.addExecutable(.{
+            .name = example_name,
+            .target = target,
+            .optimize = optimize,
+        });
+        exe.addIncludePath(".");
+        exe.addIncludePath("./examples");
+        exe.addCSourceFiles(&.{
+            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
+            "examples/common.cpp",
+        }, &.{"-std=c++11"});
+        exe.linkLibrary(lib);
+        b.installArtifact(exe);
+
+        const run_cmd = b.addRunArtifact(exe);
+        run_cmd.step.dependOn(b.getInstallStep());
+        if (b.args) |args| run_cmd.addArgs(args);
+
+        const run_step = b.step("run_" ++ example_name, "Run the app");
+        run_step.dependOn(&run_cmd.step);
+    }
 }
convert.py (115 changed lines)
@@ -130,6 +130,14 @@ TENSORS_LIST = make_tensors_list()
 TENSORS_SET = set(TENSORS_LIST)


+def find_n_mult(n_ff: int, n_embd: int) -> int:
+    # hardcoded magic range
+    for n_mult in range(256, 1, -1):
+        calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
+        if calc_ff == n_ff:
+            return n_mult
+    return 1
+
 @dataclass
 class Params:
     n_vocab: int
@@ -137,21 +145,61 @@
     n_mult:  int
     n_head:  int
     n_layer: int
-    file_type: GGMLFileType

     @staticmethod
-    def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
-        n_vocab, n_embd = model["tok_embeddings.weight"].shape
+    def guessed(model: 'LazyModel') -> 'Params':
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        else:
+            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        n_head=n_embd // 128 # guessed

         return Params(
             n_vocab=n_vocab,
             n_embd=n_embd,
             n_mult=256,
-            n_head=n_embd // 128,
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
-            file_type=file_type,
+            n_head=n_head,
+            n_layer=n_layer,
         )

+    @staticmethod
+    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab = config["vocab_size"];
+        n_embd = config["hidden_size"];
+        n_head = config["num_attention_heads"];
+        n_layer = config["num_hidden_layers"];
+        n_ff = config["intermediate_size"];
+
+        n_mult = find_n_mult(n_ff, n_embd);
+
+        return Params(
+            n_vocab=n_vocab,
+            n_embd=n_embd,
+            n_mult=n_mult,
+            n_head=n_head,
+            n_layer=n_layer,
+        )
+
+    @staticmethod
+    def load(model_plus: 'ModelPlus') -> 'Params':
+        orig_config_path = model_plus.paths[0].parent / "params.json"
+        hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
+
+        if hf_transformer_config_path.exists():
+            params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
+        else:
+            params = Params.guessed(model_plus.model)
+
+        print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
+        return params
+

 class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
@@ -512,7 +560,11 @@ class LazyTensor:
         if not isinstance(self.data_type, QuantizedDataType):
             raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
         if self.data_type.have_g_idx:
-            sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML. For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n")
+            sys.stderr.write(
+                "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
+                "which is not yet natively supported by GGML. "
+                "For now you can still convert this model by passing `--outtype f16` to dequantize, "
+                "but that will result in a much larger output file for no quality benefit.\n")
             sys.exit(1)
         assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
@@ -591,18 +643,17 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)


-def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
+def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}
     out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
     out["norm.weight"] = model["model.norm.weight"]
     out["output.weight"] = model["lm_head.weight"]

-    n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
             break
-        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
-        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
+        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
         out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
@@ -695,7 +746,8 @@ class LazyUnpickler(pickle.Unpickler):
         return LazyStorage(load=load, kind=pid[1], description=description)

     # @staticmethod
-    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
+                               # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)
@@ -915,7 +967,7 @@ class OutputFile:
     def __init__(self, fname_out: Path) -> None:
         self.fout = open(fname_out, "wb")

-    def write_file_header(self, params: Params) -> None:
+    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
         self.fout.write(b"ggjt"[::-1])  # magic
         values = [
             1,  # file version
@@ -925,7 +977,7 @@ class OutputFile:
             params.n_head,
             params.n_layer,
             params.n_embd // params.n_head,  # rot (obsolete)
-            params.file_type.value,
+            file_type.value,
         ]
         self.fout.write(struct.pack("i" * len(values), *values))
@@ -946,17 +998,17 @@ class OutputFile:
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
         params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
-                        n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
+                        n_head=1, n_layer=0)
         of = OutputFile(fname_out)
-        of.write_file_header(params)
+        of.write_file_header(params, file_type=GGMLFileType.AllF32)
         of.write_vocab(vocab)
         of.fout.close()

     @staticmethod
-    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
         check_vocab_size(params, vocab)
         of = OutputFile(fname_out)
-        of.write_file_header(params)
+        of.write_file_header(params, file_type)
         print("Writing vocab...")
         of.write_vocab(vocab)
@@ -992,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
     raise Exception(f"Unexpected combination of types: {name_to_type}")


-def do_necessary_conversions(model: LazyModel) -> LazyModel:
+def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
     model = handle_quantization(model)

     if "lm_head.weight" in model:
-        model = convert_transformers_to_orig(model)
+        model = convert_transformers_to_orig(model, params)
     model = filter_and_sort_tensors(model)

     return model
@@ -1094,23 +1146,27 @@ def load_vocab(path: Path) -> SentencePieceVocab:
     elif path3.exists():
         path = path3
     else:
-        raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir")
+        raise FileNotFoundError(
+            f"Could not find tokenizer.model in {path} or its parent; "
+            "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)


-def default_outfile(model_paths: List[Path], params: Params) -> Path:
+def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
     namestr = {
         GGMLFileType.AllF32: "f32",
         GGMLFileType.MostlyF16: "f16",
         GGMLFileType.MostlyQ4_0: "q4_0",
         GGMLFileType.MostlyQ4_1: "q4_1",
         GGMLFileType.PerLayerIsQ4_1: "q4_1",
-    }[params.file_type]
+    }[file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
     if ret in model_paths:
-        sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input. Please explicitly specify a path using --outfile.\n")
+        sys.stderr.write(
+            f"Error: Default output path ({ret}) would overwrite the input. "
+            "Please explicitly specify a path using --outfile.\n")
         sys.exit(1)
     return ret
@@ -1131,7 +1187,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--outtype",   choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--outfile",   type=Path, help="path to write to; default: based on input")
-    parser.add_argument("model",       type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("model", type=Path,
+                        help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
     args = parser.parse_args(args_in)

     vocab: Vocab
@@ -1154,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
         else:
             vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
             vocab = load_vocab(vocab_dir)
+        params = Params.load(model_plus)
         model = model_plus.model
-        model = do_necessary_conversions(model)
+        model = do_necessary_conversions(model, params)
         output_type = pick_output_type(model, args.outtype)
         model = convert_to_output_type(model, output_type)
-        params = Params.guessed(model, output_type)
-        outfile = args.outfile or default_outfile(model_plus.paths, params)
-        OutputFile.write_all(outfile, params, model, vocab)
+        outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+        OutputFile.write_all(outfile, params, output_type, model, vocab)
         print(f"Wrote {outfile}")
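The `--outtype`, `--vocab-dir`, and `--outfile` flags touched above combine as in this sketch (all paths are hypothetical):

```sh
python convert.py ./models/my-hf-model \
    --outtype f16 \
    --vocab-dir ./models/tokenizer \
    --outfile ./models/my-hf-model/ggml-model-f16.bin
```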
examples/CMakeLists.txt
@@ -37,6 +37,8 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
+    add_subdirectory(train-text-from-scratch)
+    add_subdirectory(simple)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()
examples/baby-llama/baby-llama.cpp
@@ -4,6 +4,10 @@
 #include <random>
 #include <cstring>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 float frand() {
     return (float)rand()/(float)RAND_MAX;
 }
@@ -79,34 +83,39 @@ struct ggml_tensor * randomize_tensor_normal(
         int ndims,
         const int64_t ne[],
         struct random_normal_distribution * rnd) {
+    float scale = 1.0; // xavier
     switch (ndims) {
         case 1:
+            scale /= sqrtf(ne[0]);
             for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)tensor->data)[i0] = frand_normal(rnd);
+                ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
             }
             break;
         case 2:
+            scale /= sqrtf(ne[0]+ne[1]);
             for (int i1 = 0; i1 < ne[1]; i1++) {
                 for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd);
+                    ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
                 }
             }
             break;
         case 3:
+            scale /= sqrtf(ne[0]+ne[1]);
             for (int i2 = 0; i2 < ne[2]; i2++) {
                 for (int i1 = 0; i1 < ne[1]; i1++) {
                     for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
+                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                     }
                 }
             }
             break;
         case 4:
+            scale /= sqrtf(ne[0]+ne[1]);
             for (int i3 = 0; i3 < ne[3]; i3++) {
                 for (int i2 = 0; i2 < ne[2]; i2++) {
                     for (int i1 = 0; i1 < ne[1]; i1++) {
                         for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
+                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                         }
                     }
                 }
@@ -148,8 +157,8 @@ struct llama_hparams_lora {
     uint32_t n_rot  = 64;
     uint32_t n_lora = 64;

-    bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+    bool operator!=(const llama_hparams_lora & other) const {
+        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
     }
 };

@@ -1465,7 +1474,7 @@ struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_te
 }

 struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-    const float eps = 1e-3;
+    const float eps = 1e-3f;
     return
         ggml_sum(ctx,
             ggml_neg(ctx,
examples/benchmark/benchmark-matmult.cpp
@@ -16,6 +16,10 @@
 #include <iterator>
 #include <algorithm>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type==GGML_TYPE_F32) {
@@ -29,9 +33,9 @@ float tensor_sum_elements(const ggml_tensor * tensor) {
 }

 void tensor_dump(const ggml_tensor * tensor, const char * name) {
-    printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name,
+    printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
         tensor->type, ggml_type_name(tensor->type),
-        (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
     float sum = tensor_sum_elements(tensor);
     printf("Sum of tensor %s is %6.2f\n", name, sum);
 }
@@ -120,7 +124,7 @@ int main(int argc, char ** argv) {
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;

-    printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024));
+    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));

     struct ggml_init_params params = {
         /*.mem_size =*/ ctx_size,
examples/chat-vicuna.sh (new executable file, +41)
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -e
+
+cd "$(dirname "$0")/.." || exit
+
+MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="### Human"
+AI_NAME="### Assistant"
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+    $PROMPT_TEMPLATE > $PROMPT_FILE
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./bin/main $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --n_predict "$N_PREDICTS" \
+  --color --interactive \
+  --file ${PROMPT_FILE} \
+  --reverse-prompt "### Human:" \
+  --in-prefix ' ' \
+  "$@"
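Because the new script takes its settings from environment variables, it can be pointed at another model or thread count without editing it. A hypothetical invocation from the repository root (it expects a `./bin/main` binary relative to the repository root, and the model path is an assumption):

```sh
MODEL=./models/ggml-vicuna-7b-q4_0.bin N_THREAD=4 ./examples/chat-vicuna.sh
```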
examples/common.cpp
@@ -28,6 +28,10 @@
 #include <wchar.h>
 #endif

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 int32_t get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores
@@ -102,9 +106,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         }

         if (arg == "-s" || arg == "--seed") {
-#if defined(GGML_USE_CUBLAS)
-            fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
-#endif
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -331,6 +332,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             }
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--low-vram" || arg == "-lv") {
+#ifdef GGML_USE_CUBLAS
+            params.low_vram = true;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
@@ -367,7 +374,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             } else {
                 throw std::exception();
             }
-        } catch (const std::exception &e) {
+        } catch (const std::exception&) {
             invalid_param = true;
             break;
         }
@@ -406,6 +413,14 @@
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
+
+#ifdef GGML_USE_CUBLAS
+    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
+        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
+        exit(1);
+    }
+#endif // GGML_USE_CUBLAS
+
     if (escape_prompt) {
         process_escapes(params.prompt);
     }
@@ -479,6 +494,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
 #endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
@@ -520,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }

-struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();

     lparams.n_ctx        = params.n_ctx;
@@ -528,6 +544,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu     = params.main_gpu;
     memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.low_vram     = params.low_vram;
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
     lparams.use_mmap     = params.use_mmap;
@@ -535,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.logits_all   = params.perplexity;
     lparams.embedding    = params.embedding;

-    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
-
-    if (lctx == NULL) {
+    llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return NULL;
+        return std::make_tuple(nullptr, nullptr);
+    }
+
+    llama_context * lctx = llama_new_context_with_model(model, lparams);
+    if (lctx == NULL) {
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return std::make_tuple(nullptr, nullptr);
     }

     if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(lctx,
+        int err = llama_model_apply_lora_from_file(model,
                                              params.lora_adapter.c_str(),
                                              params.lora_base.empty() ? NULL : params.lora_base.c_str(),
                                              params.n_threads);
         if (err != 0) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return NULL;
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return lctx;
|
return std::make_tuple(model, lctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
void console_init(console_state & con_st) {
|
void console_init(console_state & con_st) {
|
||||||
|
|
|
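The signature change above means every example now receives a `llama_model *` alongside the `llama_context *` and becomes responsible for freeing both. A minimal sketch of the calling pattern, mirroring the call sites updated later in this diff (not an additional file in the commit):

```cpp
#include <cstdio>
#include <tuple>

#include "common.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    llama_model   * model;
    llama_context * ctx;

    // unpack the (model, context) pair returned by the new helper
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
        fprintf(stderr, "error: unable to load model\n");
        return 1;
    }

    // ... evaluate tokens with ctx ...

    llama_free(ctx);         // destroy the context first
    llama_free_model(model); // then release the model it was created from
    return 0;
}
```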
@@ -9,6 +9,7 @@
#include <random>
#include <thread>
#include <unordered_map>
+#include <tuple>

#if !defined (_WIN32)
#include <stdio.h>
@@ -30,6 +31,7 @@ struct gpt_params {
    int32_t n_gpu_layers = 0;  // number of layers to store in VRAM
    int32_t main_gpu     = 0;  // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+   bool    low_vram     = 0;  // if true, reduce VRAM usage at the cost of performance

    // sampling parameters
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -94,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
// Model utils
//

-struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);

//
// Console utils
@@ -4,6 +4,10 @@

#include <ctime>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
int main(int argc, char ** argv) {
    gpt_params params;

@@ -33,11 +37,12 @@ int main(int argc, char ** argv) {

    llama_init_backend();

+   llama_model * model;
    llama_context * ctx;

    // load the model
-   ctx = llama_init_from_gpt_params(params);
-   if (ctx == NULL) {
+   std::tie(model, ctx) = llama_init_from_gpt_params(params);
+   if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }
@@ -86,6 +91,7 @@ int main(int argc, char ** argv) {

    llama_print_timings(ctx);
    llama_free(ctx);
+   llama_free_model(model);

    return 0;
}
@@ -1,5 +1,5 @@
import matplotlib.pyplot as plt
-import sys, os
+import os
import csv

labels = []
@@ -8,6 +8,7 @@ numEntries = 1

rows = []

+
def bar_chart(numbers, labels, pos):
    plt.bar(pos, numbers, color='blue')
    plt.xticks(ticks=pos, labels=labels)
@@ -16,6 +17,7 @@ def bar_chart(numbers, labels, pos):
    plt.ylabel("Questions Correct")
    plt.show()

+
def calculatecorrect():
    directory = os.fsencode("./examples/jeopardy/results/")
    csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
@@ -45,7 +47,6 @@ def calculatecorrect():
            numbers.append(totalcorrect)

-

if __name__ == '__main__':
    calculatecorrect()
    pos = list(range(numEntries))
@@ -288,5 +288,6 @@ These options provide extra functionality and customization when running the LLa
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
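Taken together, these GPU flags compose on a single command line. As an illustrative invocation (paths and values are hypothetical, not part of the diff), `./main -m models/65B/ggml-model-q4_0.bin -ngl 40 -ts 3,2 -mg 0 -lv -p "Hello"` would offload 40 layers, split large tensors 60/40 across two GPUs, keep small tensors on GPU 0, and skip the VRAM scratch buffer.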
@@ -23,11 +23,17 @@
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
#define NOMINMAX
+#endif
#include <windows.h>
#include <signal.h>
#endif

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
static console_state con_st;
static llama_context ** g_ctx;

@@ -101,12 +107,13 @@ int main(int argc, char ** argv) {

    llama_init_backend();

+   llama_model * model;
    llama_context * ctx;
    g_ctx = &ctx;

    // load the model and apply lora adapter, if any
-   ctx = llama_init_from_gpt_params(params);
-   if (ctx == NULL) {
+   std::tie(model, ctx) = llama_init_from_gpt_params(params);
+   if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }
@@ -133,6 +140,7 @@ int main(int argc, char ** argv) {

        llama_print_timings(ctx);
        llama_free(ctx);
+       llama_free_model(model);

        return 0;
    }
@@ -141,6 +149,7 @@ int main(int argc, char ** argv) {
    if (params.export_cgraph) {
        llama_eval_export(ctx, "llama.ggml");
        llama_free(ctx);
+       llama_free_model(model);

        return 0;
    }
@@ -331,6 +340,13 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd;

+   // do one empty run to warm up the model
+   {
+       const std::vector<llama_token> tmp = { llama_token_bos(), };
+       llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+       llama_reset_timings(ctx);
+   }
+
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
        if (embd.size() > 0) {
@@ -341,7 +357,7 @@ int main(int argc, char ** argv) {
            if ((int)embd.size() > max_embd_size) {
                auto skipped_tokens = embd.size() - max_embd_size;
                console_set_color(con_st, CONSOLE_COLOR_ERROR);
-               printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+               printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
                fflush(stdout);
                embd.resize(max_embd_size);
@@ -653,6 +669,7 @@ int main(int argc, char ** argv) {

    llama_print_timings(ctx);
    llama_free(ctx);
+   llama_free_model(model);

    return 0;
}
@@ -40,8 +40,10 @@ int main(int argc, char ** argv) {
    // this allocates all Metal resources and memory buffers
    auto * ctx_metal = ggml_metal_init();

-   ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
-   ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
+   const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
+   const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
+   ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
+   ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);

    // main
    {
@@ -5,6 +5,10 @@
#include <cmath>
#include <ctime>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
std::vector<float> softmax(const std::vector<float>& logits) {
    std::vector<float> probs(logits.size());
    float max_logit = logits[0];
@@ -145,11 +149,12 @@ int main(int argc, char ** argv) {

    llama_init_backend();

+   llama_model * model;
    llama_context * ctx;

    // load the model and apply lora adapter, if any
-   ctx = llama_init_from_gpt_params(params);
-   if (ctx == NULL) {
+   std::tie(model, ctx) = llama_init_from_gpt_params(params);
+   if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }
@@ -165,6 +170,7 @@ int main(int argc, char ** argv) {

    llama_print_timings(ctx);
    llama_free(ctx);
+   llama_free_model(model);

    return 0;
}
@@ -19,6 +19,10 @@
#include <thread>
#include <mutex>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
struct quantize_stats_params {
    std::string model = "models/7B/ggml-model-f16.bin";
    bool verbose = false;
@@ -316,6 +320,7 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "Loading model\n");

    const int64_t t_main_start_us = ggml_time_us();
+   llama_model * model;
    llama_context * ctx;

    {
@@ -326,10 +331,18 @@ int main(int argc, char ** argv) {
        lparams.f16_kv    = false;
        lparams.use_mlock = false;

-       ctx = llama_init_from_file(params.model.c_str(), lparams);
+       model = llama_load_model_from_file(params.model.c_str(), lparams);

+       if (model == NULL) {
+           fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+           return 1;
+       }
+
+       ctx = llama_new_context_with_model(model, lparams);
+
        if (ctx == NULL) {
-           fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+           fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+           llama_free_model(model);
            return 1;
        }
    }
@@ -353,6 +366,7 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
            llama_free(ctx);
+           llama_free_model(model);
            return 1;
        }
        included_layers++;
@@ -411,6 +425,7 @@ int main(int argc, char ** argv) {


    llama_free(ctx);
+   llama_free_model(model);
    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();
@@ -4,43 +4,135 @@

#include <cstdio>
#include <cstring>
-#include <map>
+#include <vector>
#include <string>

-static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
-   {"q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0},
-   {"q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1},
-   {"q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0},
-   {"q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1},
-   {"q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0},
-   {"q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K},
-   {"q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M},
-   {"q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S},
-   {"q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M},
-   {"q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L},
-   {"q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M},
-   {"q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S},
-   {"q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M},
-   {"q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M},
-   {"q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S},
-   {"q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M},
-   {"q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K},
+struct quant_option {
+   std::string name;
+   llama_ftype ftype;
+   std::string desc;
};

-bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
-   auto it = LLAMA_FTYPE_MAP.find(ftype_str);
-   if (it != LLAMA_FTYPE_MAP.end()) {
-       ftype = it->second;
-       ftype_str_out = it->first;
+static const std::vector<struct quant_option> QUANT_OPTIONS = {
+   { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M", },
+   { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L", },
+   { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M", },
+   { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M", },
+#ifdef GGML_USE_K_QUANTS
+   { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended", },
+   { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+   { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss", },
+   { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss", },
+   { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss", },
+   { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
+   { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss", },
+   { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*", },
+   { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
+   { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*", },
+   { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*", },
+   { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss", },
+#endif
+   { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended", },
+   { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G @ 7B - extremely large, virtually no quality loss - not recommended", },
+   { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G @ 7B - absolutely huge, lossless - not recommended", },
+};
+
+bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
+   std::string ftype_str;
+
+   for (auto ch : ftype_str_in) {
+       ftype_str.push_back(std::toupper(ch));
+   }
+   for (auto & it : QUANT_OPTIONS) {
+       if (it.name == ftype_str) {
+           ftype = it.ftype;
+           ftype_str_out = it.name;
            return true;
        }
-   // try to parse as an integer
+   }
    try {
        int ftype_int = std::stoi(ftype_str);
-       for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
-           if (it->second == ftype_int) {
-               ftype = it->second;
-               ftype_str_out = it->first;
+       for (auto & it : QUANT_OPTIONS) {
+           if (it.ftype == ftype_int) {
+               ftype = it.ftype;
+               ftype_str_out = it.name;
                return true;
            }
        }
@@ -52,15 +144,15 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
}

// usage:
-//  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
//
void usage(const char * executable) {
-   fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n", executable);
+   fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-   fprintf(stderr, "Allowed quantization types:\n");
-   for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
-       fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+   fprintf(stderr, "\nAllowed quantization types:\n");
+   for (auto & it : QUANT_OPTIONS) {
+       printf("  %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
    }
    exit(1);
}
@@ -35,12 +35,22 @@ int main(int argc, char ** argv) {
    auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);

    // init
-   auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+   auto model = llama_load_model_from_file(params.model.c_str(), lparams);
+   if (model == nullptr) {
+       return 1;
+   }
+   auto ctx = llama_new_context_with_model(model, lparams);
+   if (ctx == nullptr) {
+       llama_free_model(model);
+       return 1;
+   }
    auto tokens = std::vector<llama_token>(params.n_ctx);
-   auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
+   auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);

    if (n_prompt_tokens < 1) {
        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+       llama_free(ctx);
+       llama_free_model(model);
        return 1;
    }

@@ -84,6 +94,8 @@ int main(int argc, char ** argv) {
        printf("%s", next_token_str);
        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+           llama_free(ctx);
+           llama_free_model(model);
            return 1;
        }
        n_past += 1;
@@ -91,23 +103,27 @@ int main(int argc, char ** argv) {

    printf("\n\n");

-   // free old model
+   // free old context
    llama_free(ctx);

-   // load new model
-   auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+   // make new context
+   auto ctx2 = llama_new_context_with_model(model, lparams);

    // Load state (rng, logits, embedding and kv_cache) from file
    {
        FILE *fp_read = fopen("dump_state.bin", "rb");
        if (state_size != llama_get_state_size(ctx2)) {
            fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+           llama_free(ctx2);
+           llama_free_model(model);
            return 1;
        }

        const size_t ret = fread(state_mem, 1, state_size, fp_read);
        if (ret != state_size) {
            fprintf(stderr, "\n%s : failed to read state\n", __func__);
+           llama_free(ctx2);
+           llama_free_model(model);
            return 1;
        }

@@ -138,6 +154,8 @@ int main(int argc, char ** argv) {
        printf("%s", next_token_str);
        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+           llama_free(ctx2);
+           llama_free_model(model);
            return 1;
        }
        n_past += 1;
@@ -145,5 +163,8 @@ int main(int argc, char ** argv) {

    printf("\n\n");

+   llama_free(ctx2);
+   llama_free_model(model);
+
    return 0;
}
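For reference, the state this example writes to `dump_state.bin` and reads back comes from the llama.h state API. A rough, simplified sketch of the round trip (error handling omitted; the exact copy/set function signatures are taken from llama.h of this era and should be treated as an assumption, and `ctx`/`ctx2` are assumed to be contexts created from the same model as shown above):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

#include "llama.h"

// serialize the rng/logits/embedding/kv-cache state of one context and
// restore it into another context created from the same model
static void save_and_restore(llama_context * ctx, llama_context * ctx2) {
    const size_t state_size = llama_get_state_size(ctx);
    std::vector<uint8_t> state_mem(state_size);

    llama_copy_state_data(ctx, state_mem.data());

    FILE * fp_write = fopen("dump_state.bin", "wb");
    fwrite(state_mem.data(), 1, state_size, fp_write);
    fclose(fp_write);

    // ... later, read the file back and restore the state into ctx2
    FILE * fp_read = fopen("dump_state.bin", "rb");
    const size_t nread = fread(state_mem.data(), 1, state_size, fp_read);
    fclose(fp_read);

    if (nread == state_size) {
        llama_set_state_data(ctx2, state_mem.data());
    }
}
```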
@@ -1,6 +1,10 @@
set(TARGET server)
+option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp json.hpp httplib.h)
+target_compile_definitions(${TARGET} PRIVATE
+    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
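The new `LLAMA_SERVER_VERBOSE` option defaults to ON and can be disabled at configure time, for example with `cmake -DLLAMA_BUILD_SERVER=ON -DLLAMA_SERVER_VERBOSE=OFF ..` (illustrative invocation); the generator expression above then sets the `SERVER_VERBOSE` compile definition to 0 for the `server` target.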
@@ -1,33 +1,75 @@
 # llama.cpp/example/server

-This example allow you to have a llama.cpp http server to interact from a web page or consume the API.
+This example demonstrates a simple HTTP API server to interact with llama.cpp.

-## Table of Contents
+Command line options:

-1. [Quick Start](#quick-start)
-2. [Node JS Test](#node-js-test)
-3. [API Endpoints](#api-endpoints)
-4. [More examples](#more-examples)
-5. [Common Options](#common-options)
-6. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
+- `--threads N`, `-t N`: Set the number of threads to use during computation.
+- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
+- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
+- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
+- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
+- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
+- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
+- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
+- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
+- `--port`: Set the port to listen. Default: `8080`.
+- `--embedding`: Enable embedding extraction, Default: disabled.
+
+## Build
+
+Build llama.cpp with server from repository root with either make or CMake.
+
+- Using `make`:
+
+  ```bash
+  LLAMA_BUILD_SERVER=1 make
+  ```
+
+- Using `CMake`:
+
+  ```bash
+  mkdir build-server
+  cd build-server
+  cmake -DLLAMA_BUILD_SERVER=ON ..
+  cmake --build . --config Release
+  ```

 ## Quick Start

 To get started right away, run the following command, making sure to use the correct path for the model you have:

-#### Unix-based systems (Linux, macOS, etc.):
+### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./server -m models/7B/ggml-model.bin --ctx_size 2048
+./server -m models/7B/ggml-model.bin -c 2048
 ```

-#### Windows:
+### Windows:

 ```powershell
-server.exe -m models\7B\ggml-model.bin --ctx_size 2048
+server.exe -m models\7B\ggml-model.bin -c 2048
 ```

-That will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library.
+The above command will start a server that by default listens on `127.0.0.1:8080`.
+You can consume the endpoints with Postman or NodeJS with axios library.
+
+## Testing with CURL
+
+Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
+
+```sh
+curl --request POST \
+    --url http://localhost:8080/completion \
+    --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
+```

 ## Node JS Test

@@ -50,7 +92,6 @@ const prompt = `Building a website can be done in 10 simple steps:`;
 async function Test() {
     let result = await axios.post("http://127.0.0.1:8080/completion", {
         prompt,
-        batch_size: 128,
         n_predict: 512,
     });

@@ -69,246 +110,83 @@ node .

 ## API Endpoints

-You can interact with this API Endpoints. This implementations just support chat style interaction.
+- **POST** `/completion`: Given a prompt, it returns the predicted completion.

-- **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks.

 *Options:*

-`batch_size`: Set the batch size for prompt processing (default: 512).

 `temperature`: Adjust the randomness of the generated text (default: 0.8).

 `top_k`: Limit the next token selection to the K most probable tokens (default: 40).

 `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).

-`n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity).

-`threads`: Set the number of threads to use during computation.
+`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
+By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.

-`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
+`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

-`as_loop`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
+`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.

-`interactive`: It allows interacting with the completion, and the completion stops as soon as it encounters a `stop word`. To enable this, set to `true`.
+`stop`: Specify a JSON array of stopping strings.
+These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).

-`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate.
+`tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).

-`stop`: Specify the words or characters that indicate a stop. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
+`typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).

-`exclude`: Specify the words or characters you do not want to appear in the completion. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
+`repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).

-- **POST** `hostname:port/embedding`: Generate embedding of a given text
+`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).

-*Options:*
+`penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).

-`content`: Set the text to get generate the embedding.
+`presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled).

-`threads`: Set the number of threads to use during computation.
+`frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);

-To use this endpoint, you need to start the server with the `--embedding` option added.
+`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).

-- **POST** `hostname:port/tokenize`: Tokenize a given text
+`mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).

+`mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+
+`seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
+
+`ignore_eos`: Ignore end of stream token and continue generating (default: false).
+
+`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
+
+- **POST** `/tokenize`: Tokenize a given text.

 *Options:*

 `content`: Set the text to tokenize.

-- **GET** `hostname:port/next-token`: Receive the next token predicted, execute this request in a loop. Make sure set `as_loop` as `true` in the completion request.
+Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
+
+- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.

 *Options:*

-`stop`: Set `hostname:port/next-token?stop=true` to stop the token generation.
+`content`: Set the text to process.

 ## More examples

 ### Interactive mode

-This mode allows interacting in a chat-like manner. It is recommended for models designed as assistants such as `Vicuna`, `WizardLM`, `Koala`, among others. Make sure to add the correct stop word for the corresponding model.
+Check the sample in [chat.mjs](chat.mjs).
+Run with NodeJS version 16 or later:

-The prompt should be generated by you, according to the model's guidelines. You should keep adding the model's completions to the context as well.
-
-This example works well for `Vicuna - version 1`.
-
-```javascript
-const axios = require("axios");
-
-let prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-### Human: Hello, Assistant.
-### Assistant: Hello. How may I help you today?
-### Human: Please tell me the largest city in Europe.
-### Assistant: Sure. The largest city in Europe is Moscow, the capital of Russia.`;
-
-async function ChatCompletion(answer) {
-    // the user's next question to the prompt
-    prompt += `\n### Human: ${answer}\n`
-
-    result = await axios.post("http://127.0.0.1:8080/completion", {
-        prompt,
-        batch_size: 128,
-        temperature: 0.2,
-        top_k: 40,
-        top_p: 0.9,
-        n_keep: -1,
-        n_predict: 2048,
-        stop: ["\n### Human:"], // when detect this, stop completion
-        exclude: ["### Assistant:"], // no show in the completion
-        threads: 8,
-        as_loop: true, // use this to request the completion token by token
-        interactive: true, // enable the detection of a stop word
-    });
-
-    // create a loop to receive every token predicted
-    // note: this operation is blocking, avoid use this in a ui thread
-
-    let message = "";
-    while (true) {
-        // you can stop the inference adding '?stop=true' like this http://127.0.0.1:8080/next-token?stop=true
-        result = await axios.get("http://127.0.0.1:8080/next-token");
-        process.stdout.write(result.data.content);
-        message += result.data.content;
-
-        // to avoid an infinite loop
-        if (result.data.stop) {
-            console.log("Completed");
-            // make sure to add the completion to the prompt.
-            prompt += `### Assistant: ${message}`;
-            break;
-        }
-    }
-}
-
-// This function should be called every time a question to the model is needed.
-async function Test() {
-    // the server can't inference in paralell
-    await ChatCompletion("Write a long story about a time magician in a fantasy world");
-    await ChatCompletion("Summary the story");
-}
-
-Test();
+```sh
+node chat.mjs
 ```

-### Alpaca example
+Another sample in [chat.sh](chat.sh).
+Requires [bash](https://www.gnu.org/software/bash/), [curl](https://curl.se) and [jq](https://jqlang.github.io/jq/).
+Run with bash:

-**Temporaly note:** no tested, if you have the model, please test it and report me some issue
-
-```javascript
-const axios = require("axios");
-
-let prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
-`;
-
-async function DoInstruction(instruction) {
-    prompt += `\n\n### Instruction:\n\n${instruction}\n\n### Response:\n\n`;
-    result = await axios.post("http://127.0.0.1:8080/completion", {
-        prompt,
-        batch_size: 128,
-        temperature: 0.2,
-        top_k: 40,
-        top_p: 0.9,
-        n_keep: -1,
-        n_predict: 2048,
-        stop: ["### Instruction:\n\n"], // when detect this, stop completion
-        exclude: [], // no show in the completion
-        threads: 8,
-        as_loop: true, // use this to request the completion token by token
-        interactive: true, // enable the detection of a stop word
-    });
-
-    // create a loop to receive every token predicted
-    // note: this operation is blocking, avoid use this in a ui thread
-
-    let message = "";
-    while (true) {
-        result = await axios.get("http://127.0.0.1:8080/next-token");
-        process.stdout.write(result.data.content);
-        message += result.data.content;
-
-        // to avoid an infinite loop
-        if (result.data.stop) {
-            console.log("Completed");
-            // make sure to add the completion and the user's next question to the prompt.
-            prompt += message;
-            break;
-        }
-    }
-}
-
-// This function should be called every time a instruction to the model is needed.
-DoInstruction("Destroy the world"); // as joke
+```sh
+bash chat.sh
 ```

-### Embeddings
-
-First, run the server with `--embedding` option:
-
-```bash
-server -m models/7B/ggml-model.bin --ctx_size 2048 --embedding
-```
-
-Run this code in NodeJS:
-
-```javascript
-const axios = require('axios');
-
-async function Test() {
-    let result = await axios.post("http://127.0.0.1:8080/embedding", {
-        content: `Hello`,
-        threads: 5
-    });
-    // print the embedding array
-    console.log(result.data.embedding);
-}
-
-Test();
-```
-
-### Tokenize
-
-Run this code in NodeJS:
-
-```javascript
-const axios = require('axios');
-
-async function Test() {
-    let result = await axios.post("http://127.0.0.1:8080/tokenize", {
-        content: `Hello`
-    });
-    // print the embedding array
-    console.log(result.data.tokens);
-}
-
-Test();
-```
-
-## Common Options
-
-- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
-- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
-- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
-- `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
-- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
-- `--port`: Set the port to listen. Default: `8080`.
-
-### RNG Seed
-
-- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
-
-The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
-
-## Performance Tuning and Memory Options
-
-### No Memory Mapping
-
-- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance.
-
-### Memory Float 32
-
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement but does not appear to increase generation quality in a measurable way. Not recommended.
-
-## Limitations:
-
-- The actual implementation of llama.cpp need a `llama-state` for handle multiple contexts and clients, but this could require more powerful hardware.
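Beyond the NodeJS and shell samples referenced above, the documented `/completion` endpoint can also be exercised from C++. A minimal, illustrative sketch using the `httplib.h` header already bundled with this example (a non-streaming request; the JSON field names come from the option list above, and the host/port are the server defaults):

```cpp
#include <cstdio>

#include "httplib.h"

int main() {
    // assumes ./server is listening on the default host and port
    httplib::Client cli("127.0.0.1", 8080);

    // request body uses the documented /completion options
    const char * body =
        R"({"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64, "temperature": 0.8})";

    auto res = cli.Post("/completion", body, "application/json");
    if (res && res->status == 200) {
        printf("%s\n", res->body.c_str());
    } else {
        fprintf(stderr, "request failed\n");
        return 1;
    }
    return 0;
}
```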
examples/server/chat.mjs (new file, 89 lines)
@@ -0,0 +1,89 @@
import * as readline from 'node:readline'
import { stdin, stdout } from 'node:process'

const API_URL = 'http://127.0.0.1:8080'

const chat = [
    {
        human: "Hello, Assistant.",
        assistant: "Hello. How may I help you today?"
    },
    {
        human: "Please tell me the largest city in Europe.",
        assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia."
    },
]

const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.`

function format_prompt(question) {
    return `${instruction}\n${
        chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n")
    }\n### Human: ${question}\n### Assistant:`
}

async function tokenize(content) {
    const result = await fetch(`${API_URL}/tokenize`, {
        method: 'POST',
        body: JSON.stringify({ content })
    })

    if (!result.ok) {
        return []
    }

    return await result.json().tokens
}

const n_keep = await tokenize(instruction).length

async function chat_completion(question) {
    const result = await fetch(`${API_URL}/completion`, {
        method: 'POST',
        body: JSON.stringify({
            prompt: format_prompt(question),
            temperature: 0.2,
            top_k: 40,
            top_p: 0.9,
            n_keep: n_keep,
            n_predict: 256,
            stop: ["\n### Human:"], // stop completion after generating this
            stream: true,
        })
    })

    if (!result.ok) {
        return
    }

    let answer = ''

    for await (var chunk of result.body) {
        const t = Buffer.from(chunk).toString('utf8')
        if (t.startsWith('data: ')) {
            const message = JSON.parse(t.substring(6))
            answer += message.content
            process.stdout.write(message.content)
            if (message.stop) {
                if (message.truncated) {
                    chat.shift()
                }
                break
            }
        }
    }

    process.stdout.write('\n')
    chat.push({ human: question, assistant: answer.trimStart() })
}

const rl = readline.createInterface({ input: stdin, output: stdout });

const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => {
    rl.question(query, options, resolve)
});

while(true) {
    const question = await readlineQuestion(rl, '> ')
    await chat_completion(question)
}
examples/server/chat.sh (new file, 77 lines)
@@ -0,0 +1,77 @@
#!/bin/bash

API_URL="${API_URL:-http://127.0.0.1:8080}"

CHAT=(
    "Hello, Assistant."
    "Hello. How may I help you today?"
    "Please tell me the largest city in Europe."
    "Sure. The largest city in Europe is Moscow, the capital of Russia."
)

INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

trim() {
    shopt -s extglob
    set -- "${1##+([[:space:]])}"
    printf "%s" "${1%%+([[:space:]])}"
}

trim_trailing() {
    shopt -s extglob
    printf "%s" "${1%%+([[:space:]])}"
}

format_prompt() {
    echo -n "${INSTRUCTION}"
    printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1"
}

tokenize() {
    curl \
        --silent \
        --request POST \
        --url "${API_URL}/tokenize" \
        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
    | jq '.tokens[]'
}

N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l)

chat_completion() {
    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
        prompt: .,
        temperature: 0.2,
        top_k: 40,
        top_p: 0.9,
        n_keep: $n_keep,
        n_predict: 256,
        stop: ["\n### Human:"],
        stream: true
    }')"

    ANSWER=''

    while IFS= read -r LINE; do
        if [[ $LINE = data:* ]]; then
            CONTENT="$(echo "${LINE:5}" | jq -r '.content')"
            printf "%s" "${CONTENT}"
            ANSWER+="${CONTENT}"
        fi
    done < <(curl \
        --silent \
        --no-buffer \
        --request POST \
        --url "${API_URL}/completion" \
        --data-raw "${DATA}")

    printf "\n"

    CHAT+=("$1" "$(trim "$ANSWER")")
}

while true; do
    read -r -e -p "> " QUESTION
    chat_completion "${QUESTION}"
done
(File diff suppressed because it is too large.)

examples/simple/CMakeLists.txt (new file, 7 lines)
@@ -0,0 +1,7 @@
set(TARGET simple)
add_executable(${TARGET} simple.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
endif()
examples/simple/simple.cpp (new file, 179 lines)
@@ -0,0 +1,179 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "common.h"
#include "llama.h"
#include "build-info.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif


int main(int argc, char ** argv)
{
    gpt_params params;

    //---------------------------------
    // Print help :
    //---------------------------------

    if ( argc == 1 || argv[1][0] == '-' )
    {
        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
        return 1 ;
    }

    //---------------------------------
    // Load parameters :
    //---------------------------------

    if ( argc >= 2 )
    {
        params.model = argv[1];
    }

    if ( argc >= 3 )
    {
        params.prompt = argv[2];
    }

    if ( params.prompt.empty() )
    {
        params.prompt = "Hello my name is";
    }

    //---------------------------------
    // Init LLM :
    //---------------------------------

    llama_init_backend();

    llama_model * model;
    llama_context * ctx;

    std::tie(model, ctx) = llama_init_from_gpt_params( params );

    if ( model == NULL )
    {
        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
        return 1;
    }

    //---------------------------------
    // Tokenize the prompt :
    //---------------------------------

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize( ctx , params.prompt , true );

    const int max_context_size     = llama_n_ctx( ctx );
    const int max_tokens_list_size = max_context_size - 4 ;

    if ( (int)tokens_list.size() > max_tokens_list_size )
    {
        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
             __func__ , (int)tokens_list.size() , max_tokens_list_size );
        return 1;
    }

    fprintf( stderr, "\n\n" );

    // Print the tokens from the prompt :

    for( auto id : tokens_list )
    {
        printf( "%s" , llama_token_to_str( ctx , id ) );
    }

    fflush(stdout);


    //---------------------------------
    // Main prediction loop :
    //---------------------------------

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
    {
        //---------------------------------
        // Evaluate the tokens :
        //---------------------------------

        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
        {
            fprintf( stderr, "%s : failed to eval\n" , __func__ );
            return 1;
        }

        tokens_list.clear();

        //---------------------------------
        // Select the best prediction :
        //---------------------------------

        llama_token new_token_id = 0;

        auto logits  = llama_get_logits( ctx );
        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)

        std::vector<llama_token_data> candidates;
        candidates.reserve( n_vocab );

        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
        {
            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // Select it using the "Greedy sampling" method :
        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );


        // is it an end of stream ?
        if ( new_token_id == llama_token_eos() )
        {
            fprintf(stderr, " [end of text]\n");
            break;
        }

        // Print the new token :
        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
        fflush( stdout );

        // Push this new token for next evaluation :
        tokens_list.push_back( new_token_id );

    } // end of main loop

    llama_free( ctx );
    llama_free_model( model );

    return 0;
}

// EOF
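The prediction loop above fills a candidate array with one entry per vocabulary token and hands it to `llama_sample_token_greedy`, i.e. it simply takes the token with the highest logit. A tiny TypeScript sketch of that selection, independent of the llama.cpp API:

```typescript
// Greedy sampling: pick the index of the largest logit.
function sampleGreedy(logits: number[]): number {
    let best = 0;
    for (let i = 1; i < logits.length; i++) {
        if (logits[i] > logits[best]) {
            best = i;
        }
    }
    return best;
}

console.log(sampleGreedy([-1.2, 3.4, 0.7, 3.1])); // 1
```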
examples/train-text-from-scratch/CMakeLists.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
set(TARGET train-text-from-scratch)
add_executable(${TARGET} train-text-from-scratch.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/train-text-from-scratch/README.md (new file, 22 lines)
@@ -0,0 +1,22 @@
# train-text-from-scratch

Basic usage instructions:

```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt

# train
./bin/train-text-from-scratch \
        --vocab-model ../models/ggml-vocab.bin \
        --ctx 64 --embd 256 --head 8 --layer 16 \
        --checkpoint-in  chk-shakespeare-256x16.bin \
        --checkpoint-out chk-shakespeare-256x16.bin \
        --model-out ggml-shakespeare-256x16-f32.bin \
        --train-data "shakespeare.txt" \
        -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
        --print-details-interval 0 --predict 16 --use-flash

# predict
./bin/main -m ggml-shakespeare-256x16-f32.bin
```
examples/train-text-from-scratch/train-text-from-scratch.cpp (new file, 3404 lines; diff suppressed because it is too large)
flake.nix (63 lines changed)

@@ -9,27 +9,33 @@
         inherit (pkgs.stdenv) isAarch64 isDarwin;
         inherit (pkgs.lib) optionals;
         isM1 = isAarch64 && isDarwin;
-        osSpecific =
-          if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ]
-          else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ]
-          else [ ];
-        pkgs = import nixpkgs {
-          inherit system;
-        };
-        llama-python = pkgs.python310.withPackages (ps: with ps; [
-          numpy
-          sentencepiece
-        ]);
-      in
-      {
+        osSpecific = if isM1 then
+          with pkgs.darwin.apple_sdk_11_0.frameworks; [
+            Accelerate
+            MetalKit
+            MetalPerformanceShaders
+            MetalPerformanceShadersGraph
+          ]
+        else if isDarwin then
+          with pkgs.darwin.apple_sdk.frameworks; [
+            Accelerate
+            CoreGraphics
+            CoreVideo
+          ]
+        else
+          [ ];
+        pkgs = import nixpkgs { inherit system; };
+        llama-python =
+          pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
+      in {
         packages.default = pkgs.stdenv.mkDerivation {
           name = "llama.cpp";
           src = ./.;
-          postPatch =
-            if isM1 then ''
+          postPatch = if isM1 then ''
             substituteInPlace ./ggml-metal.m \
-              --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
-            '' else "";
+              --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+          '' else
+            "";
           nativeBuildInputs = with pkgs; [ cmake ];
           buildInputs = osSpecific;
           cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [

@@ -48,12 +54,21 @@
           '';
           meta.mainProgram = "llama";
         };
-        devShells.default = pkgs.mkShell {
-          packages = with pkgs; [
-            cmake
-            llama-python
-          ] ++ osSpecific;
+        apps.llama-server = {
+          type = "app";
+          program = "${self.packages.${system}.default}/bin/llama-server";
         };
-      }
-    );
+        apps.llama-embedding = {
+          type = "app";
+          program = "${self.packages.${system}.default}/bin/embedding";
+        };
+        apps.llama = {
+          type = "app";
+          program = "${self.packages.${system}.default}/bin/llama";
+        };
+        apps.default = self.apps.${system}.llama;
+        devShells.default = pkgs.mkShell {
+          packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
+        };
+      });
 }
ggml-cuda.cu (1557 lines changed; diff suppressed because it is too large)
@@ -28,8 +28,10 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
+void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
 // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
 //
 bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
                      const char * name,
                            void * data,
-                           size_t size);
+                           size_t size,
+                           size_t max_size);
 
 // set data from host memory into the device
 void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

@@ -55,6 +58,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
 
 // same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
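The new `max_size` argument drives how `ggml_metal_add_buffer` splits a host buffer that exceeds the device's `maxBufferLength` into overlapping views (see the ggml-metal.m hunk further down). A TypeScript sketch of that layout arithmetic follows; the buffer size, page size, and device limit are made-up illustrative numbers, and the logic mirrors the `size_ovlp`/`size_step`/`size_view` computation in the diff.

```typescript
// Sketch of the overlapping-view layout used when a buffer exceeds the
// device's maximum buffer length. All numbers are illustrative only.
function planViews(size: number, maxTensorSize: number, maxBufferLength: number, pageSize: number) {
    // round the maximum tensor size up by one extra page, as the Metal code does
    const sizeOvlp = (Math.floor((maxTensorSize + pageSize - 1) / pageSize) + 1) * pageSize;
    const sizeStep = maxBufferLength - sizeOvlp; // stride between view starts
    const sizeView = maxBufferLength;            // length of each full view

    const views: Array<{ offset: number; length: number }> = [];
    for (let i = 0; i < size; i += sizeStep) {
        // the final view is shortened so it ends at the buffer end
        const length = i + sizeView <= size ? sizeView : size - i;
        views.push({ offset: i, length });
    }
    // consecutive views overlap by sizeOvlp bytes, so any tensor of at most
    // maxTensorSize bytes lies entirely inside at least one view
    return views;
}

// e.g. a 4 GiB buffer, 512 MiB largest tensor, 1 GiB device limit, 16 KiB pages
console.log(planViews(4 * 2 ** 30, 512 * 2 ** 20, 2 ** 30, 16384));
```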
ggml-metal.m (246 lines changed)

@@ -52,18 +52,25 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);
     GGML_METAL_DECL_KERNEL(get_rows_q2_k);
+    GGML_METAL_DECL_KERNEL(get_rows_q3_k);
     GGML_METAL_DECL_KERNEL(get_rows_q4_k);
+    GGML_METAL_DECL_KERNEL(get_rows_q5_k);
     GGML_METAL_DECL_KERNEL(get_rows_q6_k);
     GGML_METAL_DECL_KERNEL(rms_norm);
+    GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
     GGML_METAL_DECL_KERNEL(rope);
+    GGML_METAL_DECL_KERNEL(alibi_f32);
     GGML_METAL_DECL_KERNEL(cpy_f32_f16);
     GGML_METAL_DECL_KERNEL(cpy_f32_f32);
+    GGML_METAL_DECL_KERNEL(cpy_f16_f16);
 
 #undef GGML_METAL_DECL_KERNEL
 };

@@ -153,22 +160,37 @@ struct ggml_metal_context * ggml_metal_init(void) {
         GGML_METAL_ADD_KERNEL(get_rows_q4_0);
         GGML_METAL_ADD_KERNEL(get_rows_q4_1);
         GGML_METAL_ADD_KERNEL(get_rows_q2_k);
+        GGML_METAL_ADD_KERNEL(get_rows_q3_k);
         GGML_METAL_ADD_KERNEL(get_rows_q4_k);
+        GGML_METAL_ADD_KERNEL(get_rows_q5_k);
         GGML_METAL_ADD_KERNEL(get_rows_q6_k);
         GGML_METAL_ADD_KERNEL(rms_norm);
+        GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
         GGML_METAL_ADD_KERNEL(rope);
+        GGML_METAL_ADD_KERNEL(alibi_f32);
         GGML_METAL_ADD_KERNEL(cpy_f32_f16);
         GGML_METAL_ADD_KERNEL(cpy_f32_f32);
+        GGML_METAL_ADD_KERNEL(cpy_f16_f16);
 
 #undef GGML_METAL_ADD_KERNEL
     }
 
+    fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    if (ctx->device.maxTransferRate != 0) {
+        fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+    } else {
+        fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
+    }
+
     return ctx;
 }

@@ -185,10 +207,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
     //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
+    const int64_t tsize = ggml_nbytes(t);
+
+    // find the view that contains the tensor fully
     for (int i = 0; i < ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
 
-        if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
             *offs = (size_t) ioffs;
 
             //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);

@@ -206,7 +231,8 @@ bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
                      const char * name,
                            void * data,
-                           size_t size) {
+                           size_t size,
+                           size_t max_size) {
     if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
         fprintf(stderr, "%s: too many buffers\n", __func__);
         return false;

@@ -223,31 +249,69 @@ bool ggml_metal_add_buffer(
         }
     }
 
-    size_t page_size = getpagesize();
-    size_t aligned_size = size;
-    if ((aligned_size % page_size) != 0) {
-        aligned_size += (page_size - (aligned_size % page_size));
-    }
-
-    ctx->buffers[ctx->n_buffers].name = name;
-    ctx->buffers[ctx->n_buffers].data = data;
-    ctx->buffers[ctx->n_buffers].size = size;
-
-    if (ctx->device.maxBufferLength < aligned_size) {
-        fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
-        return false;
-    }
-    ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
-
-    if (ctx->buffers[ctx->n_buffers].metal == nil) {
-        fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
-        return false;
-    } else {
-        fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
-    }
-
-    ++ctx->n_buffers;
+    const size_t size_page = getpagesize();
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= ctx->device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].name = name;
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+
+        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
+
+        fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
+        const size_t size_view = ctx->device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].name = name;
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+            if (i + size_step < size) {
+                fprintf(stderr, "\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+    fprintf(stderr, ", (%8.2f / %8.2f)",
+        ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
+        ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
+        fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
+    } else {
+        fprintf(stderr, "\n");
+    }
 
     return true;
 }

@@ -279,15 +343,40 @@ void ggml_metal_graph_compute(
         struct ggml_cgraph * gf) {
     metal_printf("%s: evaluating graph\n", __func__);
 
+    // create multiple command buffers and enqueue them
+    // then, we encode the graph into the command buffers in parallel
+
+    const int n_cb = gf->n_threads;
+
+    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
+
+    for (int i = 0; i < n_cb; ++i) {
+        command_buffers[i] = [ctx->queue commandBuffer];
+
+        // enqueue the command buffers in order to specify their execution order
+        [command_buffers[i] enqueue];
+    }
+
+    // TODO: is this the best way to start threads?
+    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
+
+        dispatch_async(queue, ^{
         size_t offs_src0 = 0;
         size_t offs_src1 = 0;
         size_t offs_dst  = 0;
 
-        id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
+        id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
 
         id<MTLComputeCommandEncoder> encoder = nil;
 
-        for (int i = 0; i < gf->n_nodes; ++i) {
-            //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+        const int node_start = (cb_idx + 0) * n_nodes_per_cb;
+        const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+
+        for (int i = node_start; i < node_end; ++i) {
+            metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
             struct ggml_tensor * src0 = gf->nodes[i]->src0;
             struct ggml_tensor * src1 = gf->nodes[i]->src1;

@@ -575,6 +664,15 @@ void ggml_metal_graph_compute(
                                 nth1 = 16;
                                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
                             } break;
+                        case GGML_TYPE_Q3_K:
+                            {
+                                GGML_ASSERT(ne02 == 1);
+                                GGML_ASSERT(ne12 == 1);
+
+                                nth0 = 4;
+                                nth1 = 16;
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
+                            } break;
                         case GGML_TYPE_Q4_K:
                             {
                                 GGML_ASSERT(ne02 == 1);

@@ -584,6 +682,15 @@ void ggml_metal_graph_compute(
                                 nth1 = 16;
                                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
                             } break;
+                        case GGML_TYPE_Q5_K:
+                            {
+                                GGML_ASSERT(ne02 == 1);
+                                GGML_ASSERT(ne12 == 1);
+
+                                nth0 = 4;
+                                nth1 = 16;
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
+                            } break;
                         case GGML_TYPE_Q6_K:
                             {
                                 GGML_ASSERT(ne02 == 1);

@@ -600,7 +707,6 @@ void ggml_metal_graph_compute(
                     }
                 };
 
-
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];

@@ -620,15 +726,14 @@ void ggml_metal_graph_compute(
                 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
                     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                } else if (src0t == GGML_TYPE_Q2_K) {
+                }
+                else if (src0t == GGML_TYPE_Q2_K ||
+                         src0t == GGML_TYPE_Q3_K ||
+                         src0t == GGML_TYPE_Q4_K ||
+                         src0t == GGML_TYPE_Q5_K ||
+                         src0t == GGML_TYPE_Q6_K) {
                     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                } else if (src0t == GGML_TYPE_Q4_K) {
-                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                } else if (src0t == GGML_TYPE_Q6_K) {
-                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 } else {
                     [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];

@@ -646,7 +751,9 @@ void ggml_metal_graph_compute(
                     case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                     case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
                     case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
+                    case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
                     case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
+                    case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
                     case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
                     default: GGML_ASSERT(false && "not implemented");
                 }

@@ -684,6 +791,70 @@ void ggml_metal_graph_compute(
 
                 [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
+        case GGML_OP_NORM:
+            {
+                if (encoder == nil) {
+                    encoder = [command_buffer computeCommandEncoder];
+                }
+
+                const float eps = 1e-5f;
+
+                const int nth = 256;
+
+                [encoder setComputePipelineState:ctx->pipeline_norm];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+                [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
+                [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+
+                const int64_t nrows = ggml_nrows(src0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ALIBI:
+            {
+                if (encoder == nil) {
+                    encoder = [command_buffer computeCommandEncoder];
+                }
+
+                GGML_ASSERT((src0t == GGML_TYPE_F32));
+
+                const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past);
+                const int n_head = ((int32_t *) src1->data)[1];
+                const float max_bias = ((float *) src1->data)[2];
+
+                if (__builtin_popcount(n_head) != 1) {
+                    GGML_ASSERT(false && "only power-of-two n_head implemented");
+                }
+
+                const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+                const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+
+                [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
+                [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
+                [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
+                [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
+                [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
+                [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
+                [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
+                [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
+                [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
+                [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
+                [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
+                [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
+                [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
+                [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
+                const int nth = 32;
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
         case GGML_OP_ROPE:
             {
                 if (encoder == nil) {

@@ -737,6 +908,14 @@ void ggml_metal_graph_compute(
                             default: GGML_ASSERT(false && "not implemented");
                         };
                     } break;
+                case GGML_TYPE_F16:
+                    {
+                        switch (dstt) {
+                            case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
+                            case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
+                            default: GGML_ASSERT(false && "not implemented");
+                        };
+                    } break;
                 default: GGML_ASSERT(false && "not implemented");
             }

@@ -773,12 +952,21 @@ void ggml_metal_graph_compute(
             }
 
             [command_buffer commit];
-            [command_buffer waitUntilCompleted];
-
-            {
-                const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
-                UNUSED(time_elapsed);
-
-                metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
-            }
+        });
+    }
+
+    // wait for all threads to finish
+    dispatch_barrier_sync(queue, ^{});
+
+    [command_buffers[n_cb - 1] waitUntilCompleted];
+
+    // check status of command buffers
+    // needed to detect if the device ran out-of-memory for example (#1881)
+    for (int i = 0; i < n_cb; i++) {
+        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
+        if (status != MTLCommandBufferStatusCompleted) {
+            fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
+            GGML_ASSERT(false);
+        }
+    }
 }
 }
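The ggml_metal_graph_compute changes above encode the graph into `gf->n_threads` command buffers concurrently, giving each buffer a contiguous slice of the nodes. A TypeScript sketch of the same partition arithmetic; the node and buffer counts are arbitrary example values.

```typescript
// Split n_nodes graph nodes across n_cb command buffers, mirroring the
// node_start / node_end computation in ggml_metal_graph_compute.
function partitionNodes(nNodes: number, nCb: number): Array<[number, number]> {
    const perCb = Math.ceil(nNodes / nCb); // (n_nodes + n_cb - 1) / n_cb
    const ranges: Array<[number, number]> = [];
    for (let cb = 0; cb < nCb; cb++) {
        const start = cb * perCb;
        // the last buffer takes whatever remainder is left
        const end = cb === nCb - 1 ? nNodes : Math.min((cb + 1) * perCb, nNodes);
        ranges.push([start, end]);
    }
    return ranges;
}

console.log(partitionNodes(1219, 8)); // [[0,153],[153,306],...,[1071,1219]]
```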
698
ggml-metal.metal
698
ggml-metal.metal
|
@ -256,6 +256,72 @@ kernel void kernel_get_rows_q4_1(
|
||||||
(device float *) ((device char *) dst + i*nb1), ne00);
|
(device float *) ((device char *) dst + i*nb1), ne00);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel void kernel_norm(
|
||||||
|
device const void * src0,
|
||||||
|
device float * dst,
|
||||||
|
constant int64_t & ne00,
|
||||||
|
constant uint64_t & nb01,
|
||||||
|
constant float & eps,
|
||||||
|
threadgroup float * sum [[threadgroup(0)]],
|
||||||
|
uint tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint ntg[[threads_per_threadgroup]]) {
|
||||||
|
device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
|
||||||
|
// MEAN
|
||||||
|
// parallel sum
|
||||||
|
sum[tpitg] = 0.0f;
|
||||||
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||||
|
sum[tpitg] += x[i00];
|
||||||
|
}
|
||||||
|
// reduce
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
for (uint i = ntg/2; i > 0; i /= 2) {
|
||||||
|
if (tpitg < i) {
|
||||||
|
sum[tpitg] += sum[tpitg + i];
|
||||||
|
}
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
}
|
||||||
|
// broadcast
|
||||||
|
if (tpitg == 0) {
|
||||||
|
sum[0] /= ne00;
|
||||||
|
}
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
const float mean = sum[0];
|
||||||
|
|
||||||
|
// recenter
|
||||||
|
device float * y = dst + tgpig*ne00;
|
||||||
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||||
|
y[i00] = x[i00] - mean;
|
||||||
|
}
|
||||||
|
|
||||||
|
// VARIANCE
|
||||||
|
// parallel sum
|
||||||
|
sum[tpitg] = 0.0f;
|
||||||
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||||
|
sum[tpitg] += y[i00] * y[i00];
|
||||||
|
}
|
||||||
|
// reduce
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
for (uint i = ntg/2; i > 0; i /= 2) {
|
||||||
|
if (tpitg < i) {
|
||||||
|
sum[tpitg] += sum[tpitg + i];
|
||||||
|
}
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
}
|
||||||
|
// broadcast
|
||||||
|
if (tpitg == 0) {
|
||||||
|
sum[0] /= ne00;
|
||||||
|
}
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
const float variance = sum[0];
|
||||||
|
|
||||||
|
const float scale = 1.0f/sqrt(variance + eps);
|
||||||
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||||
|
y[i00] = y[i00] * scale;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
kernel void kernel_rms_norm(
|
kernel void kernel_rms_norm(
|
||||||
device const void * src0,
|
device const void * src0,
|
||||||
device float * dst,
|
device float * dst,
|
||||||
|
@ -304,34 +370,22 @@ kernel void kernel_mul_mat_q4_0_f32(
|
||||||
device const float * src1,
|
device const float * src1,
|
||||||
device float * dst,
|
device float * dst,
|
||||||
constant int64_t & ne00,
|
constant int64_t & ne00,
|
||||||
constant int64_t & ne01,
|
|
||||||
constant uint64_t & nb00,
|
|
||||||
constant uint64_t & nb01,
|
|
||||||
constant uint64_t & nb02,
|
|
||||||
constant int64_t & ne10,
|
constant int64_t & ne10,
|
||||||
constant int64_t & ne11,
|
|
||||||
constant uint64_t & nb10,
|
|
||||||
constant uint64_t & nb11,
|
|
||||||
constant uint64_t & nb12,
|
|
||||||
constant int64_t & ne0,
|
constant int64_t & ne0,
|
||||||
constant int64_t & ne1,
|
|
||||||
threadgroup float * sum [[threadgroup(0)]],
|
threadgroup float * sum [[threadgroup(0)]],
|
||||||
uint2 tgpig[[threadgroup_position_in_grid]],
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
||||||
uint2 tpig[[thread_position_in_grid]],
|
|
||||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||||
uint2 tptg[[threads_per_threadgroup]]) {
|
uint2 tptg[[threads_per_threadgroup]]) {
|
||||||
const int nb = ne00/QK4_0;
|
const int nb = ne00/QK4_0;
|
||||||
|
|
||||||
const int8_t m8 = 8;
|
|
||||||
|
|
||||||
const int64_t r0 = tgpig.x;
|
const int64_t r0 = tgpig.x;
|
||||||
const int64_t r1 = tgpig.y;
|
const int64_t r1 = tgpig.y;
|
||||||
|
|
||||||
device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
|
device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
|
||||||
device const float * y = (device const float *) src1 + r1*ne10;
|
device const float * y = (device const float *) src1 + r1*ne10;
|
||||||
|
|
||||||
const uint nth = tptg.x*tptg.y;
|
const int nth = tptg.x*tptg.y;
|
||||||
const uint ith = tptg.y*tpitg.x + tpitg.y;
|
const int ith = tptg.y*tpitg.x + tpitg.y;
|
||||||
|
|
||||||
const int ix = tpitg.y/4; // 0 or 1
|
const int ix = tpitg.y/4; // 0 or 1
|
||||||
const int iy = tpitg.y - 4*ix; // 0...3
|
const int iy = tpitg.y - 4*ix; // 0...3
|
||||||
|
@ -351,47 +405,32 @@ kernel void kernel_mul_mat_q4_0_f32(
|
||||||
|
|
||||||
for (int j = 0; j < 4; ++j) {
|
for (int j = 0; j < 4; ++j) {
|
||||||
|
|
||||||
acc[0] += yl[j+ 0] * ((int8_t)(xl[j] & 0xF) - m8);
|
acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
|
||||||
acc[1] += yl[j+16] * ((int8_t)(xl[j] >> 4) - m8);
|
acc[1] += yl[j] + yl[j+16];
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sumf += d * (acc[0] + acc[1]);
|
sumf += d * (acc[0] - 8.f*acc[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
sum[ith] = sumf;
|
sum[ith] = sumf;
|
||||||
|
|
||||||
//
|
//
|
||||||
// Accumulate the sum from all threads in the threadgroup
|
// Accumulate the sum from all threads in the threadgroup
|
||||||
// This version is slightly faster than the commented out one below,
|
|
||||||
// which I copy-pasted from ggerganov's q4_0 dot product for metal.
|
|
||||||
//
|
//
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
if (ith%4 == 0) {
|
if (ith%4 == 0) {
|
||||||
for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
|
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
||||||
}
|
}
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
if (ith%16 == 0) {
|
if (ith%16 == 0) {
|
||||||
for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
|
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
||||||
}
|
}
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
if (ith == 0) {
|
if (ith == 0) {
|
||||||
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||||
dst[r1*ne0 + r0] = sum[0];
|
dst[r1*ne0 + r0] = sum[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
//// accumulate the sum from all threads in the threadgroup
|
|
||||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
||||||
//for (uint i = nth/2; i > 0; i /= 2) {
|
|
||||||
// if (ith < i) {
|
|
||||||
// sum[ith] += sum[ith + i];
|
|
||||||
// }
|
|
||||||
// threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
||||||
//}
|
|
||||||
|
|
||||||
//if (ith == 0) {
|
|
||||||
// dst[r1*ne0 + r0] = sum[0];
|
|
||||||
//}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
kernel void kernel_mul_mat_q4_1_f32(
|
kernel void kernel_mul_mat_q4_1_f32(
|
||||||
|
@ -399,20 +438,10 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||||
device const float * src1,
|
device const float * src1,
|
||||||
device float * dst,
|
device float * dst,
|
||||||
constant int64_t & ne00,
|
constant int64_t & ne00,
|
||||||
constant int64_t & ne01,
|
|
||||||
constant uint64_t & nb00,
|
|
||||||
constant uint64_t & nb01,
|
|
||||||
constant uint64_t & nb02,
|
|
||||||
constant int64_t & ne10,
|
constant int64_t & ne10,
|
||||||
constant int64_t & ne11,
|
|
||||||
constant uint64_t & nb10,
|
|
||||||
constant uint64_t & nb11,
|
|
||||||
constant uint64_t & nb12,
|
|
||||||
constant int64_t & ne0,
|
constant int64_t & ne0,
|
||||||
constant int64_t & ne1,
|
|
||||||
threadgroup float * sum [[threadgroup(0)]],
|
threadgroup float * sum [[threadgroup(0)]],
|
||||||
uint2 tgpig[[threadgroup_position_in_grid]],
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
||||||
uint2 tpig[[thread_position_in_grid]],
|
|
||||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||||
uint2 tptg[[threads_per_threadgroup]]) {
|
uint2 tptg[[threads_per_threadgroup]]) {
|
||||||
const int nb = ne00/QK4_1;
|
const int nb = ne00/QK4_1;
|
||||||
|
@ -460,11 +489,11 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||||
//
|
//
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
if (ith%4 == 0) {
|
if (ith%4 == 0) {
|
||||||
for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
|
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
||||||
}
|
}
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
if (ith%16 == 0) {
|
if (ith%16 == 0) {
|
||||||
for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
|
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
||||||
}
|
}
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
if (ith == 0) {
|
if (ith == 0) {
|
||||||
|
@ -522,6 +551,48 @@ kernel void kernel_mul_mat_f16_f32(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel void kernel_alibi_f32(
|
||||||
|
device const float * src0,
|
||||||
|
device float * dst,
|
||||||
|
constant int64_t & ne00,
|
||||||
|
constant int64_t & ne01,
|
||||||
|
constant int64_t & ne02,
|
||||||
|
constant int64_t & ne03,
|
||||||
|
constant uint64_t & nb00,
|
||||||
|
constant uint64_t & nb01,
|
||||||
|
constant uint64_t & nb02,
|
||||||
|
constant uint64_t & nb03,
|
||||||
|
constant int64_t & ne0,
|
||||||
|
constant int64_t & ne1,
|
||||||
|
constant int64_t & ne2,
|
||||||
|
constant int64_t & ne3,
|
||||||
|
constant uint64_t & nb0,
|
||||||
|
constant uint64_t & nb1,
|
||||||
|
constant uint64_t & nb2,
|
||||||
|
constant uint64_t & nb3,
|
||||||
|
constant float & m0,
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
const int64_t i03 = tgpig[2];
|
||||||
|
const int64_t i02 = tgpig[1];
|
||||||
|
const int64_t i01 = tgpig[0];
|
||||||
|
|
||||||
|
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||||
|
|
||||||
|
const int64_t i3 = n / (ne2*ne1*ne0);
|
||||||
|
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
|
||||||
|
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||||
|
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||||
|
|
||||||
|
device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
float m_k = pow(m0, i2 + 1);
|
||||||
|
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
||||||
|
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||||
|
dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
kernel void kernel_rope(
|
kernel void kernel_rope(
|
||||||
device const void * src0,
|
device const void * src0,
|
||||||
device float * dst,
|
device float * dst,
|
||||||
|
@ -577,6 +648,47 @@ kernel void kernel_rope(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel void kernel_cpy_f16_f16(
|
||||||
|
device const half * src0,
|
||||||
|
device half * dst,
|
||||||
|
constant int64_t & ne00,
|
||||||
|
constant int64_t & ne01,
|
||||||
|
constant int64_t & ne02,
|
||||||
|
constant int64_t & ne03,
|
||||||
|
constant uint64_t & nb00,
|
||||||
|
constant uint64_t & nb01,
|
||||||
|
constant uint64_t & nb02,
|
||||||
|
constant uint64_t & nb03,
|
||||||
|
constant int64_t & ne0,
|
||||||
|
constant int64_t & ne1,
|
||||||
|
constant int64_t & ne2,
|
||||||
|
constant int64_t & ne3,
|
||||||
|
constant uint64_t & nb0,
|
||||||
|
constant uint64_t & nb1,
|
||||||
|
constant uint64_t & nb2,
|
||||||
|
constant uint64_t & nb3,
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
const int64_t i03 = tgpig[2];
|
||||||
|
const int64_t i02 = tgpig[1];
|
||||||
|
const int64_t i01 = tgpig[0];
|
||||||
|
|
||||||
|
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||||
|
|
||||||
|
const int64_t i3 = n / (ne2*ne1*ne0);
|
||||||
|
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
|
||||||
|
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||||
|
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||||
|
|
||||||
|
device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
|
||||||
|
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
||||||
|
device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||||
|
dst_data[i00] = src[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
kernel void kernel_cpy_f32_f16(
|
kernel void kernel_cpy_f32_f16(
|
||||||
device const float * src0,
|
device const float * src0,
|
||||||
device half * dst,
|
device half * dst,
|
||||||
|
@@ -671,6 +783,15 @@ typedef struct {
     half d;           // super-block scale for quantized scales
     half dmin;        // super-block scale for quantized mins
 } block_q2_k;
+// 84 bytes / block
+
+typedef struct {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+    half d;                    // super-block scale
+} block_q3_k;
+// 110 bytes / block
 
 typedef struct {
     half d;             // super-block scale for quantized scales
@@ -678,6 +799,16 @@ typedef struct {
     uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];        // 4-bit quants
 } block_q4_k;
+// 144 bytes / block
+
+typedef struct {
+    half d;                    // super-block scale for quantized scales
+    half dmin;                 // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];        // quants, high bit
+    uint8_t qs[QK_K/2];        // quants, low 4 bits
+} block_q5_k;
+// 176 bytes / block
 
 typedef struct {
     uint8_t ql[QK_K/2];      // quants, lower 4 bits
@@ -685,16 +816,19 @@ typedef struct {
     int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
     half d;                  // super-block scale
 } block_q6_k;
+// 210 bytes / block
 
 static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
     uchar4 r;
     if (j < 4) {
-        r[0] = q[j+0] & 63; r[1] = q[j+4] & 63;
-        r[2] = q[j+1] & 63; r[3] = q[j+5] & 63;
+        r[0] = q[j+0] & 63;
+        r[2] = q[j+1] & 63;
+        r[1] = q[j+4] & 63;
+        r[3] = q[j+5] & 63;
     } else {
         r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
-        r[1] = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
         r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4);
+        r[1] = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
         r[3] = (q[j+5] >>  4) | ((q[j+1] >> 6) << 4);
     }
     return r;
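get_scale_min_k4 above unpacks the 12-byte scales[] field of a Q4_K/Q5_K super-block: the first eight bytes hold 6-bit scales and mins directly, and the last four bytes carry the two high bits of entries 4..7. A plain C++ sketch of the same bit layout (a hypothetical helper, shown only to make the packing explicit):

#include <cstdint>
#include <utility>

// Sketch of the 6-bit scale/min unpacking done by get_scale_min_k4 above.
// q points at the 12-byte scales[] field of a Q4_K / Q5_K super-block.
static std::pair<uint8_t, uint8_t> scale_min_k4_ref(int j, const uint8_t * q) {
    uint8_t sc, mn;
    if (j < 4) {
        sc = q[j]     & 63;                             // low 6 bits, stored whole
        mn = q[j + 4] & 63;
    } else {
        sc = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); // low nibble + 2 high bits from bytes 0..3
        mn = (q[j + 4] >>  4) | ((q[j - 0] >> 6) << 4); // high nibble + 2 high bits from bytes 4..7
    }
    return {sc, mn};
}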
@@ -735,10 +869,65 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
     }
 }
 
+static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
+    uint16_t aux[8];
+    thread const int8_t * scales = (thread const int8_t*)aux;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d_all = (float)(x[i].d);
+
+        device const uint8_t * q = x[i].qs;
+        device const uint8_t * h = x[i].hmask;
+        uint8_t m = 1;
+
+        device const uint16_t * a = (device const uint16_t *)x[i].scales;
+        aux[0] = (a[0] & kmask2) | (((a[4] >> 0) & kmask1) << 4);
+        aux[1] = (a[1] & kmask2) | (((a[5] >> 0) & kmask1) << 4);
+        aux[2] = (a[2] & kmask2) | (((a[4] >> 2) & kmask1) << 4);
+        aux[3] = (a[3] & kmask2) | (((a[5] >> 2) & kmask1) << 4);
+        aux[4] = ((a[0] >> 4) & kmask2) | (((a[4] >> 4) & kmask1) << 4);
+        aux[5] = ((a[1] >> 4) & kmask2) | (((a[5] >> 4) & kmask1) << 4);
+        aux[6] = ((a[2] >> 4) & kmask2) | (((a[4] >> 6) & kmask1) << 4);
+        aux[7] = ((a[3] >> 4) & kmask2) | (((a[5] >> 6) & kmask1) << 4);
+
+        int is = 0;
+        float dl;
+        for (int n = 0; n < QK_K; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((h[l+ 0] & m) ? 0 : 4));
+                }
+
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((h[l+16] & m) ? 0 : 4));
+                }
+
+                shift += 2;
+                m <<= 1;
+            }
+            q += 32;
+        }
+
+    }
+
+}
+
 static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
 
 
     for (int i = 0; i < nb; i++) {
 
         const float d = x[i].d;
@@ -760,6 +949,33 @@ static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, i
     }
 }
 
+static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d   = (float)(x[i].d);
+        const float min = (float)(x[i].dmin);
+
+        device const uint8_t * ql = x[i].qs;
+        device const uint8_t * qh = x[i].qh;
+
+        int is = 0;
+        uint8_t u1 = 1, u2 = 2;
+        for (int j = 0; j < QK_K; j += 64) {
+            const uchar4 sc = get_scale_min_k4(is, x[i].scales);
+            const float d1 = d * sc[0]; const float m1 = min * sc[1];
+            const float d2 = d * sc[2]; const float m2 = min * sc[3];
+            for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
+            for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l] >>  4) + (qh[l] & u2 ? 16 : 0)) - m2;
+            ql += 32; is += 2;
+            u1 <<= 2; u2 <<= 2;
+        }
+    }
+
+}
+
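The Q5_K dequantization above rebuilds each weight from a 4-bit quant plus one high bit taken from qh, then applies the per-group scale and min. A scalar sketch of a single value, assuming the (scale, min) pair has already been unpacked as in the helper above:

#include <cstdint>

// Scalar sketch of one Q5_K value, mirroring the inner loops of dequantize_row_q5_k above.
// q4 is the low 4-bit quant, hbit the matching bit from qh, (sc, mn) the unpacked 6-bit scale/min.
static float dequant_q5_one(uint8_t q4, bool hbit, float d, float dmin, uint8_t sc, uint8_t mn) {
    const int q = q4 + (hbit ? 16 : 0); // restore the 5-bit quant (0..31)
    return d * sc * q - dmin * mn;
}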
 static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, int k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
@@ -808,6 +1024,22 @@ kernel void kernel_get_rows_q2_k(
         (device float *) ((device char *) dst + i*nb1), ne00);
 }
 
+kernel void kernel_get_rows_q3_k(
+        device const  void * src0,
+        device const   int * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb1,
+        uint tpig[[thread_position_in_grid]]) {
+    const int i = tpig;
+    const int r = ((device int32_t *) src1)[i];
+
+    dequantize_row_q3_k(
+            (device const block_q3_k *) ((device char *) src0 + r*nb01),
+                       (device float *) ((device char *) dst + i*nb1), ne00);
+}
+
 kernel void kernel_get_rows_q4_k(
         device const  void * src0,
         device const   int * src1,
@@ -824,6 +1056,22 @@ kernel void kernel_get_rows_q4_k(
         (device float *) ((device char *) dst + i*nb1), ne00);
 }
 
+kernel void kernel_get_rows_q5_k(
+        device const  void * src0,
+        device const   int * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb1,
+        uint tpig[[thread_position_in_grid]]) {
+    const int i = tpig;
+    const int r = ((device int32_t *) src1)[i];
+
+    dequantize_row_q5_k(
+            (device const block_q5_k *) ((device char *) src0 + r*nb01),
+                       (device float *) ((device char *) dst + i*nb1), ne00);
+}
+
 kernel void kernel_get_rows_q6_k(
         device const  void * src0,
         device const   int * src1,
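The get_rows kernels all follow the same pattern: thread i reads a row index from src1 and dequantizes that row of the quantized matrix into row i of the float destination. The equivalent host-side gather can be sketched as follows (dequantize_row stands in generically for the type-specific functions; the helper is illustrative):

#include <cstddef>
#include <cstdint>
#include <functional>

// Sketch of the gather performed by the kernel_get_rows_* kernels above.
static void get_rows_ref(const uint8_t * src0, const int32_t * rows, int n_rows,
                         size_t nb01, float * dst, int ne00,
                         const std::function<void(const uint8_t *, float *, int)> & dequantize_row) {
    for (int i = 0; i < n_rows; ++i) {
        const int32_t r = rows[i];                                 // row index to fetch
        dequantize_row(src0 + r * nb01, dst + (size_t)i * ne00, ne00); // one quantized row -> ne00 floats
    }
}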
@@ -847,20 +1095,10 @@ kernel void kernel_mul_mat_q2_k_f32(
         device const float * src1,
         device       float * dst,
         constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
         constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
         constant   int64_t & ne0,
-        constant   int64_t & ne1,
         threadgroup float  * sum [[threadgroup(0)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpig[[thread_position_in_grid]],       // we don't use this for now
         uint2 tpitg[[thread_position_in_threadgroup]],
         uint2  tptg[[threads_per_threadgroup]]) {
 
@@ -875,7 +1113,6 @@ kernel void kernel_mul_mat_q2_k_f32(
     const int nth = tptg.x*tptg.y;
     const int ith = tptg.y*tpitg.x + tpitg.y;
 
-
     const int tid = tpitg.y;    // 0...16
     const int il  = tid/4;      // 0...3
     const int ir  = tid%4;      // 0...3
@@ -885,35 +1122,54 @@ kernel void kernel_mul_mat_q2_k_f32(
     const int n  = 8;
     const int is = 4*il + (n*ir)/16;
 
+    const int y_offset = 64*il + n*ir;
+    const int q_offset = 32*ip + n*ir;
+
     sum[ith] = 0.0f;
 
     float sumf = 0;
     for (int i = tpitg.x; i < nb; i += tptg.x) {
 
-        device const uint8_t * q = x[i].qs + 32*ip + n*ir;
+        device const uint8_t * q = x[i].qs + q_offset;
         device const uint8_t * scales = x[i].scales + is;
 
         uint8_t d1 = scales[0] & 0xF;
-        uint8_t m1 = scales[0] >>  4;
         uint8_t d2 = scales[2] & 0xF;
+        uint8_t m1 = scales[0] >>  4;
         uint8_t m2 = scales[2] >>  4;
 
-        device const float   * y = yy + i*QK_K + 64*il + n*ir;
+        device const float   * y = yy + i*QK_K + y_offset;
+
+        //float4 s = {0.f, 0.f, 0.f, 0.f};
+        float2 s = {0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < n; ++l) {
+            s[0] += y[l+ 0] * ((q[l] >> shift1) & 3);
+            s[1] += y[l+32] * ((q[l] >> shift2) & 3);
+            smin += y[l+ 0] * m1 + y[l+32] * m2;
+        }
 
         const float dall = (float)x[i].d;
         const float dmin = (float)x[i].dmin;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < n; ++l) {
-            s[0] += y[l+ 0] * ((q[l] >> shift1) & 3); s[1] += y[l+ 0];
-            s[2] += y[l+32] * ((q[l] >> shift2) & 3); s[3] += y[l+32];
-        }
-        sumf += dall * (s[0] * d1 + s[2] * d2) - dmin * (s[1] * m1 + s[3] * m2);
+        sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin;
 
     }
     sum[ith] = sumf;
 
+    //int mask1 = (ith%4 == 0);
+    //int mask2 = (ith%16 == 0);
+
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
+    //for (int i = 1; i < 4; ++i) sum[ith] += mask1 * sum[ith + i];
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
+    //for (int i = 4; i < 16; i += 4) sum[ith] += mask2 * sum[ith + i];
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
+    //if (ith == 0) {
+    //    for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
+    //    dst[r1*ne0 + r0] = sum[0];
+    //}
+
     //
     // Accumulate the sum from all threads in the threadgroup
     // This version is slightly faster than the commented out one below,
@@ -932,19 +1188,109 @@ kernel void kernel_mul_mat_q2_k_f32(
         for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
         dst[r1*ne0 + r0] = sum[0];
     }
-
-    //// accumulate the sum from all threads in the threadgroup
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    //for (uint i = nth/2; i > 0; i /= 2) {
-    //    if (ith < i) {
-    //        sum[ith] += sum[ith + i];
-    //    }
-    //    threadgroup_barrier(mem_flags::mem_threadgroup);
-    //}
-
-    //if (ith == 0) {
-    //    dst[r1*ne0 + r0] = sum[0];
-    //}
+}
+
+kernel void kernel_mul_mat_q3_k_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne10,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        threadgroup float  * sum [[threadgroup(0)]],
+        uint2 tgpig[[threadgroup_position_in_grid]],
+        uint2 tpitg[[thread_position_in_threadgroup]],
+        uint2  tptg[[threads_per_threadgroup]]) {
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
+    const uint8_t m3 = 3;
+    const int8_t  m4 = 4;
+
+    const int nb = ne00/QK_K;
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+
+    device const block_q3_k * x = (device const block_q3_k *) src0 + r0*nb;
+    device const float     * yy = (device const float      *) src1 + r1*ne10;
+
+    const int nth = tptg.x*tptg.y;
+    const int ith = tptg.y*tpitg.x + tpitg.y;
+
+    const int tid = tpitg.y;        // expecting 16
+    const int ip  = tid/8;          // 0 or 1
+    const int il  = tid/2 - 4*ip;   // 0...3
+    const int ir  = tid%2;
+    const int n   = 8;
+    const int l0  = n*ir;
+
+    const uint8_t m = 1 << (4*ip + il);
+
+    const int shift = 2*il;
+
+    const uint16_t s_shift1 = 4*ip;
+    const uint16_t s_shift2 = s_shift1 + 2*(il/2);
+    const int ik = 4 + (il%2);
+
+    const int q_offset = 32*ip + l0;
+    const int y_offset = 128*ip + 32*il + l0;
+
+    //float sumf = 0;
+    float sumf1 = 0, sumf2 = 0;
+    for (int i = tpitg.x; i < nb; i += tptg.x) {
+
+        const float d_all = (float)(x[i].d);
+
+        device const uint8_t * q = x[i].qs + q_offset;
+        device const uint8_t * h = x[i].hmask + l0;
+        device const float   * y = yy + i * QK_K + y_offset;
+
+        device const uint16_t * a = (device const uint16_t *)x[i].scales;
+        const char2 scales = as_type<char2>((uint16_t)(((a[il] >> s_shift1) & kmask2) | (((a[ik] >> s_shift2) & kmask1) << 4)));
+
+        float s = 0;
+        for (int l = 0; l < n; ++l) {
+            s += y[l+ 0] * ((int8_t)((q[l+ 0] >> shift) & m3) - ((h[l+ 0] & m) ? 0 : m4));
+        }
+        float d = d_all * s;
+        sumf1 += d * scales[0];
+        sumf2 += d;
+        //sumf += d_all * s * (scales[0] - 32);
+
+        s = 0;
+        for (int l = 0; l < n; ++l) {
+            s += y[l+16] * ((int8_t)((q[l+16] >> shift) & m3) - ((h[l+16] & m) ? 0 : m4));
+        }
+        d = d_all * s;
+        sumf1 += d * scales[1];
+        sumf2 += d;
+        //sumf += d_all * s * (scales[1] - 32);
+
+    }
+
+    //sum[ith] = sumf;
+    sum[ith] = sumf1 - 32.f*sumf2;
+
+    //
+    // Accumulate the sum from all threads in the threadgroup
+    //
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith%4 == 0) {
+        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith%16 == 0) {
+        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith == 0) {
+        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
+        dst[r1*ne0 + r0] = sum[0];
+    }
 }
 
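The mul_mat kernels above all finish with the same three-stage reduction over the threadgroup-shared sum[] array: groups of four partial sums are folded first, then groups of sixteen, and finally thread 0 adds the remaining stride-16 entries and writes the dot product. The same idea expressed sequentially in C++, assuming the thread count is a multiple of 16 as the kernels do (on the GPU each stage is separated by a threadgroup barrier):

#include <vector>

// Sequential sketch of the 4 -> 16 -> nth reduction used by the kernel_mul_mat_*_f32 kernels above.
static float reduce_partial_sums(std::vector<float> sum) {
    const int nth = (int) sum.size();                    // number of threads in the threadgroup
    for (int ith = 0; ith < nth; ith += 4) {             // stage 1: fold each group of 4
        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
    }
    for (int ith = 0; ith < nth; ith += 16) {            // stage 2: fold each group of 16
        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
    }
    for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; // stage 3: thread 0 sums the group leaders
    return sum[0];
}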
 kernel void kernel_mul_mat_q4_k_f32(
@@ -952,23 +1298,17 @@ kernel void kernel_mul_mat_q4_k_f32(
         device const float * src1,
         device       float * dst,
         constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
         constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
         constant   int64_t & ne0,
-        constant   int64_t & ne1,
         threadgroup float  * sum [[threadgroup(0)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpig[[thread_position_in_grid]],       // we don't use this for now
         uint2 tpitg[[thread_position_in_threadgroup]],
         uint2  tptg[[threads_per_threadgroup]]) {
 
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
     const int nb = ne00/QK_K;
 
     const int64_t r0 = tgpig.x;
@@ -977,37 +1317,55 @@ kernel void kernel_mul_mat_q4_k_f32(
     device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb;
     device const float     * yy = (device const float      *) src1 + r1*ne10;
 
-    const uint nth = tptg.x*tptg.y;
-    const uint ith = tptg.y*tpitg.x + tpitg.y;
+    const int nth = tptg.x*tptg.y;
+    const int ith = tptg.y*tpitg.x + tpitg.y;
 
     const int tid = tpitg.y;   // 0...16
     const int il  = tid/4;     // 0...3
-    const int ir  = tid%4;     // 0...3
-    const int n   = 8;
-    const int is  = 2*il;
+    const int ir  = tid - 4*il;// 0...3
+    const int n   = 4;
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
 
     sum[ith] = 0.0f;
 
+    uchar2 sc1, sc2, sc3, sc4;
+
     float sumf = 0;
     for (int i = tpitg.x; i < nb; i += tptg.x) {
 
-        device const uint8_t * q = (x + i)->qs + 32*il + n*ir;
-        device const float   * y = yy + i*QK_K + 64*il + n*ir;
-        device const uint8_t * scales = (x + i)->scales;
+        device const uint8_t * q1 = (x + i)->qs + q_offset;
+        device const uint8_t * q2 = q1 + 64;
+        device const float   * y1 = yy + i*QK_K + y_offset;
+        device const float   * y2 = y1 + 128;
 
         const float dall = (float)((x + i)->d);
         const float dmin = (float)((x + i)->dmin);
 
-        const uchar4 sc = get_scale_min_k4(is, scales);
+        device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
+        sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
+        sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
+        sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
+        sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
 
         float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
         for (int l = 0; l < n; ++l) {
-            s[0] += y[l+ 0] * (q[l] & 0xF); s[1] += y[l+ 0];
-            s[2] += y[l+32] * (q[l] >>  4); s[3] += y[l+32];
-        }
-        sumf += dall * (s[0] * sc[0] + s[2] * sc[2]) - dmin * (s[1] * sc[1] + s[3] * sc[3]);
+            s[0] += y1[l] * (q1[l] & 0xF); s[1] += y1[l+32] * (q1[l] >> 4);
+            s[2] += y2[l] * (q2[l] & 0xF); s[3] += y2[l+32] * (q2[l] >> 4);
+            smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
 
+        }
+        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
+
     }
 
     sum[ith] = sumf;
 
     //
@@ -1043,25 +1401,114 @@ kernel void kernel_mul_mat_q4_k_f32(
     //}
 }
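Instead of calling get_scale_min_k4 per group, the updated Q4_K kernel (and the new Q5_K kernel below) reads the 12 scale bytes as 16-bit words and extracts two packed (scale, min) values at a time with kmask1/kmask2/kmask3. A host-side sketch of the straightforward half of that extraction, hedged as an illustration: for the first two 32-value groups the 6-bit fields sit in whole bytes, so one AND with kmask1 yields a pair at once; the remaining groups (sc3/sc4 in the kernel) reassemble their two high bits via kmask3 on top of the low nibbles selected by kmask2.

#include <cstdint>
#include <cstring>

// Sketch of the kmask1-based extraction of the first scale/min pair (sc1/sc2 in the kernel above).
// a points at the scales[] field of a block_q4_k / block_q5_k, viewed as uint16_t words.
static void unpack_first_pair(const uint16_t * a, int im, uint8_t sc[2], uint8_t mn[2]) {
    const uint16_t sc16 = a[im + 0] & 0x3f3f; // two packed 6-bit scales
    const uint16_t mn16 = a[im + 2] & 0x3f3f; // two packed 6-bit mins
    std::memcpy(sc, &sc16, 2);                // byte-wise view, like as_type<uchar2> in Metal
    std::memcpy(mn, &mn16, 2);
}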
+
+kernel void kernel_mul_mat_q5_k_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne10,
+        constant   int64_t & ne0,
+        threadgroup float  * sum [[threadgroup(0)]],
+        uint2 tgpig[[threadgroup_position_in_grid]],
+        uint2 tpitg[[thread_position_in_threadgroup]],
+        uint2  tptg[[threads_per_threadgroup]]) {
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int nb = ne00/QK_K;
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+
+    device const block_q5_k * x = (device const block_q5_k *) src0 + r0*nb;
+    device const float     * yy = (device const float      *) src1 + r1*ne10;
+
+    const int nth = tptg.x*tptg.y;
+    const int ith = tptg.y*tpitg.x + tpitg.y;
+
+    const int tid = tpitg.y;   // 0...16
+    const int il  = tid/4;     // 0...3
+    const int ir  = tid - 4*il;// 0...3
+    const int n   = 4;
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    const uint8_t hm1 = 1u << (2*im);
+    const uint8_t hm2 = hm1 << 1;
+    const uint8_t hm3 = hm1 << 4;
+    const uint8_t hm4 = hm2 << 4;
+
+    uchar2 sc1, sc2, sc3, sc4;
+
+    float sumf = 0;
+    for (int i = tpitg.x; i < nb; i += tptg.x) {
+
+        device const uint8_t * q1 = (x + i)->qs + q_offset;
+        device const uint8_t * q2 = q1 + 64;
+        device const uint8_t * qh = (x + i)->qh + l0;
+        device const float   * y1 = yy + i*QK_K + y_offset;
+        device const float   * y2 = y1 + 128;
+
+        const float dall = (float)((x + i)->d);
+        const float dmin = (float)((x + i)->dmin);
+
+        device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
+        sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
+        sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
+        sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
+        sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < n; ++l) {
+
+            s[0] += y1[l+ 0] * ((q1[l] & 0xF) + (qh[l] & hm1 ? 16 : 0));
+            s[1] += y1[l+32] * ((q1[l] >>  4) + (qh[l] & hm2 ? 16 : 0));
+            s[2] += y2[l+ 0] * ((q2[l] & 0xF) + (qh[l] & hm3 ? 16 : 0));
+            s[3] += y2[l+32] * ((q2[l] >>  4) + (qh[l] & hm4 ? 16 : 0));
+            smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
+
+        }
+        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
+
+    }
+    sum[ith] = sumf;
+
+    //
+    // Accumulate the sum from all threads in the threadgroup
+    //
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith%4 == 0) {
+        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith%16 == 0) {
+        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith == 0) {
+        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
+        dst[r1*ne0 + r0] = sum[0];
+    }
+
+}
+
 kernel void kernel_mul_mat_q6_k_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
         constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
         constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
         constant   int64_t & ne0,
-        constant   int64_t & ne1,
         threadgroup float  * sum [[threadgroup(0)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpig[[thread_position_in_grid]],       // we don't use this for now
         uint2 tpitg[[thread_position_in_threadgroup]],
         uint2  tptg[[threads_per_threadgroup]]) {
@@ -1078,24 +1525,29 @@ kernel void kernel_mul_mat_q6_k_f32(
     device const block_q6_k * x = (device const block_q6_k *) src0 + r0*nb;
     device const float     * yy = (device const float      *) src1 + r1*ne10;
 
-    const uint nth = tptg.x*tptg.y;
-    const uint ith = tptg.y*tpitg.x + tpitg.y;
+    const int nth = tptg.x*tptg.y;
+    const int ith = tptg.y*tpitg.x + tpitg.y;
 
-    const int step = QK_K / tptg.y;     // we expect this to be 16
-    const int iqs  = step * tpitg.y;    // 0...240 in steps of 16
+    // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
+    const int iqs  = 16 * tpitg.y;
     const int ip   = iqs / 128;         // 0 or 1
     const int il   = (iqs - 128*ip)/16; // 0...7
     const int n    = 4;
-    const int is   = 8*ip + (n*il)/16;
+    const int l0   = n*il;
+    const int is   = 8*ip + l0/16;
+
+    const int y_offset   = 128*ip + l0;
+    const int q_offset_l =  64*ip + l0;
+    const int q_offset_h =  32*ip + l0;
 
     float sumf = 0;
     for (int i = tpitg.x; i < nb; i += tptg.x) {
 
-        device const uint8_t * ql = x[i].ql + 64*ip + n*il;
-        device const uint8_t * qh = x[i].qh + 32*ip + n*il;
+        device const uint8_t * ql = x[i].ql + q_offset_l;
+        device const uint8_t * qh = x[i].qh + q_offset_h;
         device const int8_t  * sc = x[i].scales + is;
 
-        device const float * y = yy + i * QK_K + 128*ip + n*il;
+        device const float * y = yy + i * QK_K + y_offset;
 
         const float dall = x[i].d;
 
497 ggml-opencl.cpp
@@ -15,7 +15,11 @@
 
 #include "ggml.h"
 
-#define CL_DMMV_BLOCK_SIZE 32;
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define CL_DMMV_BLOCK_SIZE 32
 
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
 static std::string program_source = MULTILINE_QUOTE(
@@ -59,6 +63,46 @@ struct __attribute__ ((packed)) block_q8_0
     int8_t qs[QK8_0];
 };
 
+struct __attribute__((packed)) block_q2_K
+{
+    uint8_t scales[16];
+    uint8_t qs[64];
+    half d;
+    half dmin;
+};
+
+struct __attribute__((packed)) block_q3_K
+{
+    uint8_t hmask[32];
+    uint8_t qs[64];
+    uint8_t scales[12];
+    half d;
+};
+
+struct __attribute__((packed)) block_q4_K
+{
+    half d;
+    half dmin;
+    uint8_t scales[12];
+    uint8_t qs[128];
+};
+
+struct __attribute__((packed)) block_q5_K
+{
+    half d;
+    half dmin;
+    uint8_t scales[12];
+    uint8_t qh[32];
+    uint8_t qs[128];
+};
+
+struct __attribute__((packed)) block_q6_K
+{
+    uint8_t ql[128];
+    uint8_t qh[64];
+    int8_t scales[16];
+    half d;
+};
+
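The packed structs above mirror the K-quant super-block layouts on the device side; with QK_K = 256 their sizes match the byte counts noted in the Metal header earlier in this diff (84, 110, 144, 176 and 210 bytes). A host-side sanity check one could keep next to such declarations, sketched with uint16_t standing in for half:

#include <cstdint>

// Host-side mirror of the K-quant block layouts above (QK_K = 256, half ~ 2 bytes).
namespace kq_layout_check {
    typedef uint16_t half_t; // stand-in for the 16-bit half type

    struct block_q2_K { uint8_t scales[16]; uint8_t qs[64]; half_t d; half_t dmin; };
    struct block_q3_K { uint8_t hmask[32]; uint8_t qs[64]; uint8_t scales[12]; half_t d; };
    struct block_q4_K { half_t d; half_t dmin; uint8_t scales[12]; uint8_t qs[128]; };
    struct block_q5_K { half_t d; half_t dmin; uint8_t scales[12]; uint8_t qh[32]; uint8_t qs[128]; };
    struct block_q6_K { uint8_t ql[128]; uint8_t qh[64]; int8_t scales[16]; half_t d; };

    static_assert(sizeof(block_q2_K) ==  84, "q2_K block is 84 bytes");
    static_assert(sizeof(block_q3_K) == 110, "q3_K block is 110 bytes");
    static_assert(sizeof(block_q4_K) == 144, "q4_K block is 144 bytes");
    static_assert(sizeof(block_q5_K) == 176, "q5_K block is 176 bytes");
    static_assert(sizeof(block_q6_K) == 210, "q6_K block is 210 bytes");
}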
 __kernel void convert_fp16_to_fp32(__global half* x, __global float* y) {
     const uint i = get_global_id(0);
@@ -131,8 +175,314 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float
     *v0 = vload_half(0, &x[ib + 0]);
     *v1 = vload_half(0, &x[ib + 1]);
 }
+
+inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
+{
+    if (j < 4)
+    {
+        *d = q[j] & 63;
+        *m = q[j + 4] & 63;
+    }
+    else
+    {
+        *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
+        *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
+    }
+}
+
+__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
+{
+    const int i = get_group_id(0);
+    const int tid = get_local_id(0);
+    const int n = tid / 32;
+    const int l = tid - 32 * n;
+    const int is = 8 * n + l / 16;
+
+    const uint8_t q = x[i].qs[32 * n + l];
+    __global float *y = yy + i * 256 + 128 * n;
+
+    const float dall = vload_half(0, &x[i].d);
+    const float dmin = vload_half(0, &x[i].dmin);
+
+    y[l +  0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4);
+    y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4);
+    y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4);
+    y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4);
+}
+
+__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
+{
+    int r = get_local_id(0) / 4;
+    int i = get_group_id(0);
+    int tid = r / 2;
+    int is0 = r % 2;
+    int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
+    int n = tid / 4;
+    int j = tid - 4 * n;
+
+    uint8_t m = 1 << (4 * n + j);
+    int is = 8 * n + 2 * j + is0;
+    int shift = 2 * j;
+
+    int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4)
+              : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4)
+              : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4)
+              : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4);
+    float d_all = vload_half(0, &x[i].d);
+    float dl = d_all * (us - 32);
+
+    __global float *y = yy + i * 256 + 128 * n + 32 * j;
+    const __global uint8_t *q = x[i].qs + 32 * n;
+    const __global uint8_t *hm = x[i].hmask;
+
+    for (int l = l0; l < l0 + 4; ++l)
+        y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
+}
+
+__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
+{
+    const int i = get_group_id(0);
+    const int tid = get_local_id(0);
+    const int il = tid / 8;
+    const int ir = tid % 8;
+    const int is = 2 * il;
+    const int n = 4;
+
+    __global float *y = yy + i * 256 + 64 * il + n * ir;
+
+    const float dall = vload_half(0, &x[i].d);
+    const float dmin = vload_half(0, &x[i].dmin);
+
+    __global const uint8_t *q = x[i].qs + 32 * il + n * ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
+    float d1 = dall * sc;
+    float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
+    float d2 = dall * sc;
+    float m2 = dmin * m;
+    for (int l = 0; l < n; ++l)
+    {
+        y[l +  0] = d1 * (q[l] & 0xF) - m1;
+        y[l + 32] = d2 * (q[l] >> 4) - m2;
+    }
+}
+
+__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
+{
+    const int i = get_group_id(0);
+    const int tid = get_local_id(0);
+    const int il = tid / 16;
+    const int ir = tid % 16;
+    const int is = 2 * il;
+
+    __global float *y = yy + i * 256 + 64 * il + 2 * ir;
+
+    const float dall = vload_half(0, &x[i].d);
+    const float dmin = vload_half(0, &x[i].dmin);
+
+    __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir;
+    __global const uint8_t *qh = x[i].qh + 2 * ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
+    const float d1 = dall * sc;
+    const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
+    const float d2 = dall * sc;
+    const float m2 = dmin * m;
+
+    uint8_t hm = 1 << (2 * il);
+    y[ 0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1;
+    y[ 1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1;
+    hm <<= 1;
+    y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2;
+    y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2;
+}
+
+__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
+{
+    const int i = get_group_id(0);
+    const int tid = get_local_id(0);
+    const int ip = tid / 32;
+    const int il = tid - 32 * ip;
+    const int is = 8 * ip + il / 16;
+
+    __global float *y = yy + i * 256 + 128 * ip + il;
+
+    const float d = vload_half(0, &x[i].d);
+
+    __global const uint8_t *ql = x[i].ql + 64 * ip + il;
+    const uint8_t qh = x[i].qh[32 * ip + il];
+    __global const int8_t *sc = x[i].scales + is;
+
+    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
+    y[64] = d * sc[4] * ((int8_t)((ql[ 0] >>  4) | (((qh >> 4) & 3) << 4)) - 32);
+    y[96] = d * sc[6] * ((int8_t)((ql[32] >>  4) | (((qh >> 6) & 3) << 4)) - 32);
+}
+
+
+void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
+
+    int n = iqs / 128;
+    int r = iqs - 128 * n;
+    int l = r / 8;
+
+    __global const float *y = yy + 128 * n + l;
+    __global const uint8_t *q = x[ib].qs + 32 * n + l;
+    __global const uint8_t *s = x[ib].scales + 8 * n;
+
+    const float dall = vload_half(0, &x[ib].d);
+    const float dmin = vload_half(0, &x[ib].dmin);
+
+    float sum = y[  0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
+              + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
+              + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
+              + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
+              + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
+              + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
+              + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
+              + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
+
+    *result = sum;
+}
+
+void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
+
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    int n = iqs/128;
+    int r = iqs - 128*n;
+    int l = r/8;
+
+    __global const float   * y = yy + 128*n + l;
+    __global const uint8_t * q = x[ib].qs + 32*n + l;
+    __global const uint8_t * hm = x[ib].hmask + l;
+    const int8_t * s = (const int8_t *)utmp + 8*n;
+
+    aux[0] = x[ib].scales[0] | x[ib].scales[1] << 8 | x[ib].scales[2] << 16 | x[ib].scales[3] << 24;
+    aux[1] = x[ib].scales[4] | x[ib].scales[5] << 8 | x[ib].scales[6] << 16 | x[ib].scales[7] << 24;
+    aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24;
+
+    utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+    utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+    utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+    utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+    const float dall = vload_half(0, &x[ib].d);
+    const uint8_t m = 1 << (4*n);
+
+    float sum = y[  0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
+              + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
+              + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
+              + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
+              + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
+              + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
+              + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
+              + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
+
+    *result = sum * dall;
+
+}
+
+void vec_dot_q4_K(__global const struct block_q4_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
+
+    const int j = iqs / 64;          // j  is in 0...3
+    const int ir = (iqs - 64*j)/2;   // ir is in 0...28 in steps of 4
+    const int is = 2*j;              // is is in 0...6 in steps of 2
+
+    __global const float   * y = yy + 64*j + ir;
+    __global const uint8_t * q = x[ib].qs + 32*j + ir;
+
+    const float dall = vload_half(0, &x[ib].d);
+    const float dmin = vload_half(0, &x[ib].dmin);
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
+    const float d1 = dall * sc;
+    const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
+    const float d2 = dall * sc;
+    const float m2 = dmin * m;
+
+    float sum = 0;
+    for (int k = 0; k < 4; ++k) {
+        sum += y[k +  0] * (d1 * (q[k] & 0xF) - m1);
+        sum += y[k + 32] * (d2 * (q[k] >>  4) - m2);
+    }
+
+    *result = sum;
+}
+
+void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
+
+    const int j = iqs / 64;
+    const int ir = (iqs - 64*j)/2;
+    const int is = 2*j;
+
+    __global const float   * y  = yy + 64*j + ir;
+    __global const uint8_t * ql = x[ib].qs + 32*j + ir;
+    __global const uint8_t * qh = x[ib].qh + ir;
+
+    const float dall = vload_half(0, &x[ib].d);
+    const float dmin = vload_half(0, &x[ib].dmin);
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
+    const float d1 = dall * sc;
+    const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
+    const float d2 = dall * sc;
+    const float m2 = dmin * m;
+
+    uint8_t hm = 1 << is;
+    float sum = 0;
+    for (int k = 0; k < 4; ++k) {
+        sum += y[k +  0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
+    }
+    hm <<= 1;
+    for (int k = 0; k < 4; ++k) {
+        sum += y[k + 32] * (d2 * ((ql[k] >>  4) + (qh[k] & hm ? 16 : 0)) - m2);
+    }
+    *result = sum;
+
+}
+
+void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
+
+    const int ip = iqs / 128;        // 0 or 1
+    const int il = (iqs - 128*ip)/8; // 0...15
+    const int is = 8*ip;
+
+    __global const float * y = yy + 128*ip + il;
+
+    const float d = vload_half(0, &x[ib].d);
+
+    __global const uint8_t * ql = x[ib].ql + 64*ip + il;
+    __global const uint8_t * qh = x[ib].qh + 32*ip + il;
+    __global const int8_t  * sc = x[ib].scales + is;
+
+    *result = y[  0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
+            + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
+            + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >>  4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
+            + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >>  4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
+            + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
+            + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
+            + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >>  4) | (((qh[16] >> 4) & 3) << 4)) - 32)
+            + y[112] * d * sc[7] * ((int8_t)((ql[48] >>  4) | (((qh[16] >> 6) & 3) << 4)) - 32);
+
+}
+
 );
 
 
 std::string dequant_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
     const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
@@ -160,7 +510,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
     const int block_size = get_local_size(0);
-    const int row = get_global_id(0) / block_size;
+    const int row = get_group_id(0);
     const int tid = get_local_id(0);
 
     const uint qk = QUANT_K;
@@ -199,6 +549,45 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 }
 );
 
+std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE(
+__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
+    const int block_size = get_local_size(0);
+    const int row = get_group_id(0);
+    const int tid = get_local_id(0);
+
+    const int iter_stride = 256;
+    const int vals_per_iter = iter_stride / block_size;
+    const int num_blocks_per_row = ncols / 256;
+    const int ib0 = row*num_blocks_per_row;
+
+    tmp[tid] = 0;
+
+    for (int i = 0; i < ncols; i += iter_stride) {
+        const int col = i + vals_per_iter*tid;
+        const int ib = ib0 + col/256;   // x block index
+        const int iqs = col%256;        // x quant index
+        const int iybs = col - col%256; // y block start index
+
+        // dequantize
+        float v;
+        DOT_KERNEL(x, ib, iqs, y + iybs, &v);
+        tmp[tid] += v;
+    }
+
+    // sum up partial sums and write back result
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int s=block_size/2; s>0; s>>=1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if (tid == 0) {
+        dst[row] = tmp[0];
+    }
+}
+);
+
 std::string mul_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
     const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
@@ -260,6 +649,18 @@ std::array<std::string, 2> mul_str_values = {
     "mul_f32", "float"
 };
 
+std::array<std::string, 3> dmmv_k_str_keys = {
+    "KERNEL_NAME", "X_TYPE", "DOT_KERNEL"
+};
+
+std::array<std::string, 15> dmmv_k_str_values = {
+    "dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
+    "dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K",
+    "dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K",
+    "dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K",
+    "dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K",
+};
+
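Each (KERNEL_NAME, X_TYPE, DOT_KERNEL) triple above is spliced into dequant_mul_mat_vec_k_template by the generate_kernels() loop added further down, producing one OpenCL kernel per K-quant type. For illustration, instantiating the first triple by hand would look roughly like this (a sketch of the string substitution, not new functionality):

#include <string>

// Rough illustration of what generate_kernels() does with the arrays above:
// every placeholder in the template is replaced by the i-th triple of dmmv_k_str_values.
static std::string instantiate_q2_K_dmmv(std::string tmpl /* = dequant_mul_mat_vec_k_template */) {
    auto replace_all = [](std::string & s, const std::string & from, const std::string & to) {
        for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
            s.replace(pos, from.size(), to);
        }
    };
    replace_all(tmpl, "KERNEL_NAME", "dequantize_mul_mat_vec_q2_K");
    replace_all(tmpl, "X_TYPE",      "struct block_q2_K");
    replace_all(tmpl, "DOT_KERNEL",  "vec_dot_q2_K");
    return tmpl; // OpenCL source for the q2_K dequant-mul-mat-vec kernel
}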
 std::string& replace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -289,6 +690,14 @@ std::string generate_kernels() {
         }
         src << mul_kernel << '\n';
     }
+    for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) {
+        std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template;
+        for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) {
+            replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]);
+        }
+        src << dmmv_k_kernel << '\n';
+    }
 
     return src.str();
 }
@@ -300,6 +709,8 @@ static cl_program program;
 static cl_kernel convert_row_f16_cl;
 static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
 static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
+static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
 static cl_kernel mul_f32_cl;
 static bool fp16_support;
 
@@ -529,6 +940,12 @@ void ggml_cl_init(void) {
     CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
     CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
     CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
+    CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
+    CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err));
+    CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err));
+    CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err));
+    CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err));
+    CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err));
 
     // dequant mul mat kernel
     CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err));
@@ -537,6 +954,11 @@ void ggml_cl_init(void) {
     CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
     CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
     CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err));
+    CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err));
 
     // mul kernel
     CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
@@ -554,6 +976,16 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
             return &dequantize_row_q5_1_cl;
         case GGML_TYPE_Q8_0:
             return &dequantize_row_q8_0_cl;
+        case GGML_TYPE_Q2_K:
+            return &dequantize_block_q2_k_cl;
+        case GGML_TYPE_Q3_K:
+            return &dequantize_block_q3_k_cl;
+        case GGML_TYPE_Q4_K:
+            return &dequantize_block_q4_k_cl;
+        case GGML_TYPE_Q5_K:
+            return &dequantize_block_q5_k_cl;
+        case GGML_TYPE_Q6_K:
+            return &dequantize_block_q6_k_cl;
         case GGML_TYPE_F16:
             return &convert_row_f16_cl;
         default:
@@ -561,6 +993,50 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
     }
 }
 
+static size_t ggml_cl_global_denom(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+            return 4;
+        case GGML_TYPE_Q4_K:
+            return 8;
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return 4;
+        case GGML_TYPE_F16:
+        default:
+            return 1;
+    }
+}
+
+static size_t ggml_cl_local_size(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 0;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+            return 64;
+        case GGML_TYPE_Q4_K:
+            return 32;
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return 64;
+        case GGML_TYPE_F16:
+        default:
+            return 0;
+    }
+}
+
static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
|
static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
|
@ -575,6 +1051,16 @@ static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
|
||||||
return &dequantize_mul_mat_vec_q8_0_cl;
|
return &dequantize_mul_mat_vec_q8_0_cl;
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
return &convert_mul_mat_vec_f16_cl;
|
return &convert_mul_mat_vec_f16_cl;
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
|
return &dequantize_mul_mat_vec_q2_K_cl;
|
||||||
|
case GGML_TYPE_Q3_K:
|
||||||
|
return &dequantize_mul_mat_vec_q3_K_cl;
|
||||||
|
case GGML_TYPE_Q4_K:
|
||||||
|
return &dequantize_mul_mat_vec_q4_K_cl;
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
|
return &dequantize_mul_mat_vec_q5_K_cl;
|
||||||
|
case GGML_TYPE_Q6_K:
|
||||||
|
return &dequantize_mul_mat_vec_q6_K_cl;
|
||||||
default:
|
default:
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
@ -1017,6 +1503,9 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||||
cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
|
cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
|
||||||
GGML_ASSERT(to_fp32_cl != nullptr);
|
GGML_ASSERT(to_fp32_cl != nullptr);
|
||||||
|
|
||||||
|
const size_t global_denom = ggml_cl_global_denom(type);
|
||||||
|
const size_t local = ggml_cl_local_size(type);
|
||||||
|
|
||||||
size_t ev_idx = 0;
|
size_t ev_idx = 0;
|
||||||
std::vector<cl_event> events;
|
std::vector<cl_event> events;
|
||||||
|
|
||||||
|
@ -1049,10 +1538,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
||||||
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
||||||
// convert src0 to fp32 on device
|
// convert src0 to fp32 on device
|
||||||
const size_t global = x_ne;
|
const size_t global = x_ne / global_denom;
|
||||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
||||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
||||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||||
|
|
||||||
// copy src1 to device
|
// copy src1 to device
|
||||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
|
||||||
|
|
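The two helpers added above feed into how the dequantization kernel is launched further down: the global work size is the element count divided by the per-type denominator, and a local size of 0 means the OpenCL runtime picks the work-group size. A rough sketch of that launch logic in isolation, assuming `type`, `x_ne`, `queue` and `to_fp32_cl` are already set up as in the function above:

    // Sketch of the launch-size logic for the to-fp32 kernels (OpenCL C API).
    const size_t global_denom = ggml_cl_global_denom(type);   // e.g. 8 for Q4_K
    const size_t local        = ggml_cl_local_size(type);     // e.g. 32 for Q4_K, 0 for Q4_0
    const size_t global       = x_ne / global_denom;          // total work items

    // Passing NULL for the local size lets the OpenCL implementation choose it.
    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL,
                                    &global, local > 0 ? &local : NULL,
                                    0, NULL, NULL));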
334 ggml.h
@@ -296,12 +296,14 @@ extern "C" {
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
+        GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
         GGML_OP_STEP,
         GGML_OP_RELU,
         GGML_OP_GELU,
+        GGML_OP_GELU_QUICK,
         GGML_OP_SILU,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
@@ -309,6 +311,7 @@ extern "C" {
         GGML_OP_RMS_NORM_BACK,

         GGML_OP_MUL_MAT,
+        GGML_OP_OUT_PROD,

         GGML_OP_SCALE,
         GGML_OP_SET,
@@ -324,19 +327,31 @@ extern "C" {
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
+        GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_1S,
-        GGML_OP_CONV_1D_2S,
+        GGML_OP_CONV_1D_S1_PH,
+        GGML_OP_CONV_1D_S2_PH,
+        GGML_OP_CONV_2D_SK_P0,

         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
+        GGML_OP_FLASH_ATTN_BACK,
+        GGML_OP_WIN_PART,
+        GGML_OP_WIN_UNPART,

         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,

+        GGML_OP_MAP_CUSTOM1,
+        GGML_OP_MAP_CUSTOM2,
+        GGML_OP_MAP_CUSTOM3,
+
+        GGML_OP_CROSS_ENTROPY_LOSS,
+        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+
         GGML_OP_COUNT,
     };
@@ -478,6 +493,7 @@ extern "C" {

     GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);

     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
@@ -492,8 +508,9 @@ extern "C" {
     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

-    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
-    GGML_API size_t ggml_get_mem_size  (struct ggml_context * ctx);
+    GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_mem_size       (const struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_max_tensor_size(const struct ggml_context * ctx);

     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
@@ -549,7 +566,8 @@ extern "C" {
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

     GGML_API const char *         ggml_get_name(const struct ggml_tensor * tensor);
-    GGML_API void                 ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);

     //
     // operations on tensors with backpropagation
     //
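Since `ggml_set_name` now returns the tensor it was given, naming can be folded into tensor construction, and `ggml_format_name` takes a printf-style format. A small usage sketch; the tensor names and sizes here are illustrative, and `ctx` / `n_layer` are assumed to exist in the caller:

    // Chain naming into tensor creation.
    struct ggml_tensor * cur = ggml_set_name(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64), "ffn_input");

    // Printf-style naming, e.g. to tag per-layer tensors.
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        ggml_format_name(w, "layer_%d.w", il);
    }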
@@ -574,6 +592,11 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_add1_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_acc(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -597,24 +620,47 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_sub_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_mul(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_mul_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_div(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_div_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_sqr(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_sqr_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_sqrt(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_sqrt_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_log(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
@@ -645,35 +691,76 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_repeat_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_abs_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_sgn(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_sgn_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_neg(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_neg_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_step(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_step_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_relu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // TODO: double-check this computation is correct
     GGML_API struct ggml_tensor * ggml_gelu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_gelu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_silu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_silu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // a - x
     // b - dy
     GGML_API struct ggml_tensor * ggml_silu_back(
@@ -687,10 +774,18 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // a - x
     // b - dy
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
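The `_inplace` variants added above follow the pattern described elsewhere in this header ("in-place, returns view(a)"): the node writes into the input's existing buffer and returns a view of it instead of allocating a new result tensor. A minimal sketch, assuming a context `ctx` is already initialized; the effect applies when the graph is computed:

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    struct ggml_tensor * y  = ggml_mul(ctx, x, b);          // allocates a separate result tensor
    struct ggml_tensor * y2 = ggml_mul_inplace(ctx, x, b);  // reuses x's data; y2 is a view of x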
@@ -698,14 +793,22 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

-    // A: m rows, n columns
-    // B: p rows, n columns (i.e. we transpose it internally)
+    // A: n columns, m rows
+    // B: n columns, p rows  (i.e. we transpose it internally)
     // result is m columns, p rows
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    // A: m columns, n rows,
+    // B: p columns, n rows,
+    // result is m columns, p rows
+    GGML_API struct ggml_tensor * ggml_out_prod(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     //
     // operations on tensors without backpropagation
     //
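To make the corrected dimension comments concrete: in ggml, `ne[0]` is the number of columns (the row length), so the shapes line up as in this sketch; the sizes are illustrative and `ctx` is assumed to exist:

    // ggml_mul_mat: a is n x m (n columns, m rows), b is n x p -> result is m x p.
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, /*n=*/128, /*m=*/32);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, /*n=*/128, /*p=*/ 4);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);   // c->ne[0] == 32, c->ne[1] == 4

    // ggml_out_prod: a is m x n, b is p x n -> result is m x p, per the comment above.
    struct ggml_tensor * u = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, /*m=*/32, /*n=*/128);
    struct ggml_tensor * v = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, /*p=*/ 4, /*n=*/128);
    struct ggml_tensor * w = ggml_out_prod(ctx, u, v);  // w->ne[0] == 32, w->ne[1] == 4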
@@ -916,6 +1019,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_soft_max_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
@@ -961,16 +1075,55 @@ extern "C" {
             float                 min,
             float                 max);

-    // padding = 1
+    // TODO: implement general-purpose convolutions
+    // GGML_API struct ggml_tensor * ggml_conv_1d(
+    //        struct ggml_context * ctx,
+    //        struct ggml_tensor  * a,
+    //        struct ggml_tensor  * b,
+    //        int                   s0
+    //        int                   p0,
+    //        int                   d0);
+    //
+    // GGML_API struct ggml_tensor * ggml_conv_2d(
+    //        struct ggml_context * ctx,
+    //        struct ggml_tensor  * a,
+    //        struct ggml_tensor  * b,
+    //        int                   s0,
+    //        int                   s1,
+    //        int                   p0,
+    //        int                   p1,
+    //        int                   d0,
+    //        int                   d1);
+
+    // padding = half
     // TODO: we don't support extra parameters for now
     //       that's why we are hard-coding the stride, padding, and dilation
     //       not great ..
-    GGML_API struct ggml_tensor * ggml_conv_1d_1s(
+    // example:
+    // a:      3   80  768    1
+    // b:   3000   80    1    1
+    // res: 3000  768    1    1
+    // used in whisper
+    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

-    GGML_API struct ggml_tensor * ggml_conv_1d_2s(
+    // used in whisper
+    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a:     16   16    3  768
+    // b:   1024 1024    3    1
+    // res:   64   64  768    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
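The `ggml_conv_2d_sk_p0` example follows directly from stride = kernel size and zero padding: each output position covers one non-overlapping kernel-sized patch. A quick check of the arithmetic, using the shapes from the comment above:

    // input b is 1024 x 1024 with 3 channels, kernel a is 16 x 16 x 3 with 768 output channels
    const int W = 1024, H = 1024;   // input width/height (b->ne[0], b->ne[1])
    const int KW = 16,  KH = 16;    // kernel width/height (a->ne[0], a->ne[1])

    const int OW = W / KW;          // 1024 / 16 = 64
    const int OH = H / KH;          // 1024 / 16 = 64
    // result: OW x OH x 768 -> 64 x 64 x 768, matching the "res" line in the comment.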
@@ -982,6 +1135,14 @@ extern "C" {
             struct ggml_tensor  * v,
             bool                  masked);

+    GGML_API struct ggml_tensor * ggml_flash_attn_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * d,
+            bool                  masked);
+
     GGML_API struct ggml_tensor * ggml_flash_ff(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
|
@ -990,21 +1151,106 @@ extern "C" {
|
||||||
struct ggml_tensor * c0,
|
struct ggml_tensor * c0,
|
||||||
struct ggml_tensor * c1);
|
struct ggml_tensor * c1);
|
||||||
|
|
||||||
// Mapping operations
|
// partition into non-overlapping windows with padding if needed
|
||||||
|
// example:
|
||||||
|
// a: 768 64 64 1
|
||||||
|
// w: 14
|
||||||
|
// res: 768 14 14 25
|
||||||
|
// used in sam
|
||||||
|
GGML_API struct ggml_tensor * ggml_win_part(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
int w);
|
||||||
|
|
||||||
|
// reverse of ggml_win_part
|
||||||
|
// used in sam
|
||||||
|
GGML_API struct ggml_tensor * ggml_win_unpart(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
int w0,
|
||||||
|
int h0,
|
||||||
|
int w);
|
||||||
|
|
||||||
|
// custom operators
|
||||||
|
|
||||||
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
||||||
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
||||||
|
|
||||||
|
typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
|
||||||
|
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
|
||||||
|
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
ggml_unary_op_f32_t fun);
|
ggml_unary_op_f32_t fun);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
ggml_unary_op_f32_t fun);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
ggml_binary_op_f32_t fun);
|
ggml_binary_op_f32_t fun);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
ggml_binary_op_f32_t fun);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_map_custom1_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
ggml_custom1_op_f32_t fun);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
ggml_custom1_op_f32_t fun);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_map_custom2_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
ggml_custom2_op_f32_t fun);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
ggml_custom2_op_f32_t fun);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_map_custom3_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
struct ggml_tensor * c,
|
||||||
|
ggml_custom3_op_f32_t fun);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
struct ggml_tensor * c,
|
||||||
|
ggml_custom3_op_f32_t fun);
|
||||||
|
|
||||||
|
// loss function
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
struct ggml_tensor * c);
|
||||||
|
|
||||||
//
|
//
|
||||||
// automatic differentiation
|
// automatic differentiation
|
||||||
//
|
//
|
||||||
|
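A hedged sketch of how the new custom-operator hooks could be used. The callback name and the element-wise operation are made up for illustration, and it assumes the `custom1` callback is invoked with the destination and source tensors when the graph is computed, so the function fills `dst->data` itself:

    // Hypothetical custom op: writes the element-wise square of src into dst.
    static void my_square_f32(struct ggml_tensor * dst, const struct ggml_tensor * src) {
        const int64_t n = ggml_nelements(src);
        float       * d = (float *) dst->data;
        const float * s = (const float *) src->data;
        for (int64_t i = 0; i < n; ++i) {
            d[i] = s[i]*s[i];
        }
    }

    // Build a graph node that applies the custom op to tensor x; my_square_f32
    // runs when the graph containing y is computed.
    struct ggml_tensor * y = ggml_map_custom1_f32(ctx, x, my_square_f32);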
@@ -1099,6 +1345,8 @@ extern "C" {
         struct {
             int n_iter;

+            float sched; // schedule multiplier (fixed, decay or warmup)
+            float decay; // weight decay for AdamW, use 0.0f to disable
             float alpha; // learning rate
             float beta1;
             float beta2;
@@ -1123,6 +1371,49 @@ extern "C" {
         } lbfgs;
     };

+    struct ggml_opt_context {
+        struct ggml_context * ctx;
+        struct ggml_opt_params params;
+
+        int iter;
+        int64_t nx; // number of parameter elements
+
+        bool just_initialized;
+
+        struct {
+            struct ggml_tensor * x;  // view of the parameters
+            struct ggml_tensor * g1; // gradient
+            struct ggml_tensor * g2; // gradient squared
+            struct ggml_tensor * m;  // first moment
+            struct ggml_tensor * v;  // second moment
+            struct ggml_tensor * mh; // first moment hat
+            struct ggml_tensor * vh; // second moment hat
+            struct ggml_tensor * pf; // past function values
+            float fx_best;
+            float fx_prev;
+            int n_no_improvement;
+        } adam;
+
+        struct {
+            struct ggml_tensor * x;    // current parameters
+            struct ggml_tensor * xp;   // previous parameters
+            struct ggml_tensor * g;    // current gradient
+            struct ggml_tensor * gp;   // previous gradient
+            struct ggml_tensor * d;    // search direction
+            struct ggml_tensor * pf;   // past function values
+            struct ggml_tensor * lmal; // the L-BFGS memory alpha
+            struct ggml_tensor * lmys; // the L-BFGS memory ys
+            struct ggml_tensor * lms;  // the L-BFGS memory s
+            struct ggml_tensor * lmy;  // the L-BFGS memory y
+            float fx_best;
+            float step;
+            int j;
+            int k;
+            int end;
+            int n_no_improvement;
+        } lbfgs;
+    };
+
     GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);

     // optimize the function defined by the tensor f
@@ -1131,6 +1422,27 @@ extern "C" {
             struct ggml_opt_params params,
             struct ggml_tensor * f);

+    // initialize optimizer context
+    GGML_API void ggml_opt_init(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_opt_params params,
+            int64_t nx);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume_g(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f,
+            struct ggml_cgraph * gf,
+            struct ggml_cgraph * gb);
+
     //
     // quantization
     //
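A minimal sketch of how the resumable-optimizer API above could fit together, assuming a loss tensor `f` built in `ctx` with its parameters marked via `ggml_set_param` and `nx` equal to the number of parameter elements; the decay/sched values and the epoch loop are illustrative, not taken from the diff:

    // One-shot optimization (existing entry point).
    struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
    params.adam.decay = 0.01f;   // AdamW-style weight decay, 0.0f disables it
    params.adam.sched = 1.0f;    // schedule multiplier

    enum ggml_opt_result res = ggml_opt(ctx, params, f);

    // Resumable optimization (new API): keep optimizer state in ggml_opt_context across calls.
    struct ggml_opt_context opt;
    ggml_opt_init(ctx, &opt, params, nx);

    for (int epoch = 0; epoch < 10; ++epoch) {
        res = ggml_opt_resume(ctx, &opt, f);  // continues from the state accumulated so far
    }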
445 llama.cpp
@@ -19,6 +19,11 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_K_QUANTS
+#ifndef QK_K
+#define QK_K 256
+#endif
+#endif

 #include <array>
 #include <ctime>
@@ -40,6 +45,10 @@
 #include <sstream>
 #include <numeric>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16

@@ -165,9 +174,27 @@ struct llama_kv_cache {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        ggml_cuda_free_data(k);
+        ggml_cuda_free_data(v);
+#endif // GGML_USE_CUBLAS
     }
 };

+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
 struct llama_model {
     e_model type = MODEL_UNKNOWN;

@@ -184,10 +211,6 @@ struct llama_model {
     // context
     struct ggml_context * ctx = NULL;

-    // key + value cache for the self attention
-    // TODO: move to llama_state
-    struct llama_kv_cache kv_self;
-
     // the model memory buffer
     llama_ctx_buffer buf;

@@ -201,6 +224,11 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_vocab vocab;
+
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -210,6 +238,7 @@ struct llama_model {
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cuda_free_data(tensors_by_name[i].second);
         }
+        ggml_cuda_free_scratch();
 #elif defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
@@ -218,24 +247,11 @@ struct llama_model {
     }
 };

-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
 struct llama_context {
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
     std::mt19937 rng;

-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
     bool has_evaluated_once = false;

     int64_t t_sample_us = 0;
@@ -246,8 +262,16 @@ struct llama_context {
     int32_t n_eval   = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

-    llama_model model;
-    llama_vocab vocab;
+    const llama_model & model;
+    const llama_vocab & vocab;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;

     size_t mem_per_token = 0;
@@ -867,7 +891,8 @@ static bool kv_cache_init(
         const struct llama_hparams & hparams,
              struct llama_kv_cache & cache,
                          ggml_type   wtype,
-                               int   n_ctx) {
+                               int   n_ctx,
+                               int   n_gpu_layers) {
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;

@@ -875,6 +900,7 @@ static bool kv_cache_init(
     const int64_t n_elements = n_embd*n_mem;

     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.n = 0;

     struct ggml_init_params params;
     params.mem_size   = cache.buf.size;
@@ -893,25 +919,36 @@ static bool kv_cache_init(
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");

+    (void) n_gpu_layers;
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer + 1) {
+        ggml_cuda_assign_buffers_no_scratch(cache.v);
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        ggml_cuda_assign_buffers_no_scratch(cache.k);
+    }
+#endif // GGML_USE_CUBLAS
+
     return true;
 }

 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
+        /*.seed                        =*/ -1,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
-        /*.seed                        =*/ -1,
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.low_vram                    =*/ false,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.embedding                   =*/ false,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
     };

     return result;
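The two `n_gpu_layers` thresholds in `kv_cache_init` encode an offloading order: values of `n_gpu_layers` above `n_layer` offload the non-repeating tensors, above `n_layer + 1` also the V cache, and above `n_layer + 2` also the K cache. A small sketch of the same decision logic in isolation (the helper name is illustrative):

    #include <cstdio>

    // Mirrors the thresholds used above.
    static void print_kv_offload_plan(int n_gpu_layers, int n_layer) {
        const bool offload_v = n_gpu_layers > n_layer + 1;
        const bool offload_k = n_gpu_layers > n_layer + 2;
        std::printf("v cache: %s, k cache: %s\n",
                    offload_v ? "GPU" : "CPU",
                    offload_k ? "GPU" : "CPU");
    }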
@@ -1005,12 +1042,14 @@ static const char *llama_model_type_name(e_model type) {

 static void llama_model_load_internal(
         const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1018,12 +1057,11 @@ static void llama_model_load_internal(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {

-    lctx.t_start_us = ggml_time_us();
+    model.t_start_us = ggml_time_us();

     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));

-    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
-    auto & model = lctx.model;
+    vocab = std::move(ml->file_loaders.at(0)->vocab);
     model.hparams = ml->file_loaders.at(0)->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1036,6 +1074,12 @@ static void llama_model_load_internal(
             case 40: model.type = e_model::MODEL_13B; break;
             case 60: model.type = e_model::MODEL_30B; break;
             case 80: model.type = e_model::MODEL_65B; break;
+            default:
+                {
+                    if (hparams.n_layer < 32) {
+                        model.type = e_model::MODEL_7B;
+                    }
+                } break;
         }

         hparams.n_ctx = n_ctx;
@@ -1087,15 +1131,15 @@ static void llama_model_load_internal(

     // create the ggml context
     {
-        lctx.model.buf.resize(ctx_size);
+        model.buf.resize(ctx_size);
         if (use_mlock) {
-            lctx.model.mlock_buf.init(lctx.model.buf.addr);
-            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.grow_to(model.buf.size);
         }

         struct ggml_init_params params = {
-            /*.mem_size   =*/ lctx.model.buf.size,
-            /*.mem_buffer =*/ lctx.model.buf.addr,
+            /*.mem_size   =*/ model.buf.size,
+            /*.mem_buffer =*/ model.buf.addr,
             /*.no_alloc   =*/ ml->use_mmap,
         };
@@ -1131,18 +1175,34 @@ static void llama_model_load_internal(
         ml->ggml_ctx = ctx;

         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
-        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);

         // "output" tensor
         {
+            ggml_backend backend_norm;
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
+                // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
                 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
+                backend_norm = GGML_BACKEND_CPU;
                 backend_output = GGML_BACKEND_CPU;
             }

+            model.norm   = ml->get_tensor("norm.weight",   {n_embd},          backend_norm);
             model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+            if (backend_norm == GGML_BACKEND_GPU) {
+                vram_weights += ggml_nbytes(model.norm);
+            }
+            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                vram_weights += ggml_nbytes(model.output);
+            }
         }

         const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1200,23 +1260,49 @@ static void llama_model_load_internal(
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

         (void) vram_scratch;
+        (void) n_batch;
 #ifdef GGML_USE_CUBLAS
-        vram_scratch = n_batch * MB;
-        ggml_cuda_set_scratch_size(vram_scratch);
-        if (n_gpu_layers > 0) {
-            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
-                    __func__, vram_scratch / MB);
+        if (low_vram) {
+            fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+            ggml_cuda_set_scratch_size(0); // disable scratch
+        } else {
+            vram_scratch = n_batch * MB;
+            ggml_cuda_set_scratch_size(vram_scratch);
+            if (n_gpu_layers > 0) {
+                fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
+                        __func__, vram_scratch / MB);
+            }
         }
 #endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-        fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+        fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
+            fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
         }
+        size_t vram_kv_cache = 0;
+        if (n_gpu_layers > (int) hparams.n_layer + 1) {
+            if (low_vram) {
+                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+            } else {
+                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            }
+        }
+        if (n_gpu_layers > (int) hparams.n_layer + 2) {
+            if (low_vram) {
+                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+            } else {
+                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            }
+        }
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+        fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
         fprintf(stderr, "%s: total VRAM used: %zu MB\n",
-                __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+                __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
         (void) n_gpu_layers;
 #endif
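The reporting above boils down to a single offload budget: `n_layer` repeating layers plus up to three extra slots (non-repeating layers, V cache, K cache), of which only the first is available when `low_vram` is set. A small sketch of that budget calculation, with an illustrative call:

    #include <algorithm>
    #include <cstdio>

    // Sketch of the offload budget used in the log messages above.
    static void report_offload(int n_gpu_layers, int n_layer, bool low_vram) {
        const int max_offloadable_layers = low_vram ? n_layer + 1 : n_layer + 3;
        std::printf("offloaded %d/%d layers to GPU\n",
                    std::min(n_gpu_layers, max_offloadable_layers), n_layer + 3);
    }

    // e.g. report_offload(35, 32, false) prints "offloaded 35/35 layers to GPU"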
@@ -1227,13 +1313,14 @@ static void llama_model_load_internal(
             model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
         }

+    (void) tensor_split;
 #if defined(GGML_USE_CUBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
     }
 #endif

-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);

     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1243,17 +1330,19 @@ static void llama_model_load_internal(

     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
-    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+    model.t_load_us = ggml_time_us() - model.t_start_us;
 }

 static bool llama_model_load(
         const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1261,7 +1350,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1299,7 +1388,7 @@ static bool llama_eval_internal(
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;

-    const auto & kv_self = model.kv_self;
+    const auto & kv_self = lctx.kv_self;

     LLAMA_ASSERT(!!kv_self.ctx);
@@ -1337,12 +1426,33 @@ static bool llama_eval_internal(
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;

+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers;
+    }
+#endif // GGML_USE_CUBLAS
+
     for (int il = 0; il < n_layer; ++il) {
         offload_func_t offload_func = llama_nop;

 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+            offload_func = ggml_cuda_assign_buffers;
         }
 #endif // GGML_USE_CUBLAS
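The pattern used here is a small function-pointer hook: every intermediate tensor is passed through an `offload_func_t`, which stays a no-op unless that part of the graph is meant to live on the GPU. A stripped-down sketch of the idea, assuming a CUDA build and that `ctx0`, `cur`, `w_k`, `n_gpu_layers` and `n_layer` exist in the caller (the weight name `w_k` is illustrative):

    // Minimal sketch of the offload-callback pattern.
    typedef void (*offload_func_t)(struct ggml_tensor * tensor);

    static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
        (void) tensor;
    }

    // ...inside graph construction:
    offload_func_t offload_func_kq = llama_nop;
    #ifdef GGML_USE_CUBLAS
    if (n_gpu_layers > n_layer + 2) {
        offload_func_kq = ggml_cuda_assign_buffers; // K/Q path lives on the GPU
    }
    #endif

    struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, w_k, cur);
    offload_func_kq(tmpk); // marks tmpk's output backend as GPU, or does nothing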
@@ -1365,31 +1475,42 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            // offload_func(tmpq);
-            ggml_set_name(tmpq, "tmpq");
-
             struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            // offload_func(tmpk);
+            offload_func_kq(tmpk);
             ggml_set_name(tmpk, "tmpk");

+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");

             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");

             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+                offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");

                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");

                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
                 ggml_set_name(v, "v");

                 // important: storing RoPE-ed version of K in the KV cache!
@ -1401,6 +1522,7 @@ static bool llama_eval_internal(
|
||||||
ggml_permute(ctx0,
|
ggml_permute(ctx0,
|
||||||
Qcur,
|
Qcur,
|
||||||
0, 2, 1, 3);
|
0, 2, 1, 3);
|
||||||
|
offload_func_kq(Q);
|
||||||
ggml_set_name(Q, "Q");
|
ggml_set_name(Q, "Q");
|
||||||
|
|
||||||
struct ggml_tensor * K =
|
struct ggml_tensor * K =
|
||||||
|
@ -1409,10 +1531,12 @@ static bool llama_eval_internal(
|
||||||
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
||||||
n_embd/n_head, n_head, n_past + N),
|
n_embd/n_head, n_head, n_past + N),
|
||||||
0, 2, 1, 3);
|
0, 2, 1, 3);
|
||||||
|
offload_func_kq(K);
|
||||||
ggml_set_name(K, "K");
|
ggml_set_name(K, "K");
|
||||||
|
|
||||||
// K * Q
|
// K * Q
|
||||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||||
|
offload_func_kq(KQ);
|
||||||
ggml_set_name(KQ, "KQ");
|
ggml_set_name(KQ, "KQ");
|
||||||
|
|
||||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||||
|
@ -1421,14 +1545,17 @@ static bool llama_eval_internal(
|
||||||
|
|
||||||
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
||||||
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
||||||
|
offload_func_kq(KQ_scaled);
|
||||||
ggml_set_name(KQ_scaled, "KQ_scaled");
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
||||||
|
|
||||||
// KQ_masked = mask_past(KQ_scaled)
|
// KQ_masked = mask_past(KQ_scaled)
|
||||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||||
|
offload_func_kq(KQ_masked);
|
||||||
ggml_set_name(KQ_masked, "KQ_masked");
|
ggml_set_name(KQ_masked, "KQ_masked");
|
||||||
|
|
||||||
// KQ = soft_max(KQ_masked)
|
// KQ = soft_max(KQ_masked)
|
||||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||||
|
offload_func_v(KQ_soft_max);
|
||||||
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
||||||
|
|
||||||
// split cached V into n_head heads
|
// split cached V into n_head heads
|
||||||
|
@ -1438,10 +1565,12 @@ static bool llama_eval_internal(
|
||||||
n_ctx*ggml_element_size(kv_self.v),
|
n_ctx*ggml_element_size(kv_self.v),
|
||||||
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
|
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
|
||||||
il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
|
il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
|
||||||
|
offload_func_v(V);
|
||||||
ggml_set_name(V, "V");
|
ggml_set_name(V, "V");
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||||
|
offload_func_v(KQV);
|
||||||
ggml_set_name(KQV, "KQV");
|
ggml_set_name(KQV, "KQV");
|
||||||
#else
|
#else
|
||||||
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
||||||
|
@ -1453,12 +1582,14 @@ static bool llama_eval_internal(
|
||||||
|
|
||||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
offload_func_v(KQV_merged);
|
||||||
ggml_set_name(KQV_merged, "KQV_merged");
|
ggml_set_name(KQV_merged, "KQV_merged");
|
||||||
|
|
||||||
// cur = KQV_merged.contiguous().view(n_embd, N)
|
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||||
cur = ggml_cpy(ctx0,
|
cur = ggml_cpy(ctx0,
|
||||||
KQV_merged,
|
KQV_merged,
|
||||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||||
|
offload_func_v(cur);
|
||||||
ggml_set_name(cur, "KQV_merged_contiguous");
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
||||||
|
|
||||||
// projection (no bias)
|
// projection (no bias)
|
||||||
|
@ -1470,7 +1601,6 @@ static bool llama_eval_internal(
|
||||||
}
|
}
|
||||||
|
|
||||||
lctx.use_buf(ctx0, 1);
|
lctx.use_buf(ctx0, 1);
|
||||||
//ggml_cuda_set_scratch(1);
|
|
||||||
|
|
||||||
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
|
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
|
||||||
offload_func(inpFF);
|
offload_func(inpFF);
|
||||||
|
@ -1500,7 +1630,7 @@ static bool llama_eval_internal(
|
||||||
model.layers[il].w1,
|
model.layers[il].w1,
|
||||||
cur);
|
cur);
|
||||||
offload_func(cur);
|
offload_func(cur);
|
||||||
ggml_set_name(cur, "result_w2");
|
ggml_set_name(cur, "result_w1");
|
||||||
|
|
||||||
// SILU activation
|
// SILU activation
|
||||||
cur = ggml_silu(ctx0, cur);
|
cur = ggml_silu(ctx0, cur);
|
||||||
|
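For context, a minimal sketch of how the offload helpers used in these hunks (offload_func_kq, offload_func_v, offload_func_nr) are typically selected, assuming a cuBLAS build; the layer-count thresholds illustrate the pattern rather than being a definitive copy of the implementation:

#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif

typedef void (*offload_func_t)(struct ggml_tensor * tensor);

static void llama_nop(struct ggml_tensor * tensor) { // keep the tensor on the CPU
    (void) tensor;
}

// Sketch only: pick a backend per tensor group depending on how many layers fit on the GPU.
static void pick_offload_funcs(int n_gpu_layers, int n_layer,
                               offload_func_t & offload_func_nr,  // "nr" = non-repeating tensors (output norm, ...)
                               offload_func_t & offload_func_kq,  // K/Q attention tensors
                               offload_func_t & offload_func_v) { // V attention tensors
    offload_func_nr = llama_nop;
    offload_func_kq = llama_nop;
    offload_func_v  = llama_nop;
#ifdef GGML_USE_CUBLAS
    if (n_gpu_layers > n_layer)     { offload_func_nr = ggml_cuda_assign_buffers; }
    if (n_gpu_layers > n_layer + 1) { offload_func_v  = ggml_cuda_assign_buffers; }
    if (n_gpu_layers > n_layer + 2) { offload_func_kq = ggml_cuda_assign_buffers; }
#endif
}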
@@ -1528,32 +1658,20 @@ static bool llama_eval_internal(
     }

     lctx.use_buf(ctx0, 0);
-    //ggml_cuda_set_scratch(0);

     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;

-    offload_func_t offload_func = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer) {
-        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
-    }
-#endif // GGML_USE_CUBLAS
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
-        offload_func(cur);
+        offload_func_nr(cur);
-        ggml_set_name(cur, "rms_norm_inpL");
+        ggml_set_name(cur, "rms_norm_2");

-        cur = ggml_rms_norm(ctx0, cur);
-        offload_func(cur);
-        ggml_set_name(cur, "rms_norm_after");
-
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
-        offload_func(cur);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
         ggml_set_name(cur, "result_norm");

         embeddings = cur;
@@ -1618,7 +1736,7 @@ static bool llama_eval_internal(
     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

     // update kv token count
-    lctx.model.kv_self.n = n_past + N;
+    lctx.kv_self.n = n_past + N;

     // extract logits
     {
@@ -1897,9 +2015,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     for (size_t i = 0; i < candidates->size; ++i) {
         cum_sum += candidates->data[i].p;

-        // Check if the running sum is greater than p or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep) {
-            last_idx = i;
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
             break;
         }
     }
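The fixed cut-off rule can be exercised on its own; a standalone sketch with made-up probabilities (not the library code itself):

#include <cstdio>
#include <vector>

// Illustration of the corrected top-p cut-off: keep tokens until the cumulative
// probability reaches p, always keep at least min_keep tokens, and include the
// token that crosses the threshold.
int main() {
    const std::vector<float> probs = {0.4f, 0.3f, 0.2f, 0.1f}; // already sorted, sums to 1
    const float  p        = 0.7f;
    const size_t min_keep = 1;

    size_t last_idx = probs.size();
    float  cum_sum  = 0.0f;
    for (size_t i = 0; i < probs.size(); ++i) {
        cum_sum += probs[i];
        if (cum_sum >= p && i + 1 >= min_keep) {
            last_idx = i + 1; // the current token is part of the kept set
            break;
        }
    }
    printf("kept %zu of %zu tokens\n", last_idx, probs.size()); // prints: kept 2 of 4 tokens
    return 0;
}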
@@ -2161,6 +2280,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         return -log2f(candidate.p) > *mu;
     }));

+    if (candidates->size == 0) {
+        candidates->size = 1;
+    }

     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
@@ -2298,7 +2421,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16:  quantized_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break;

+#ifdef GGML_USE_K_QUANTS
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2309,6 +2435,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
+#endif
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -2320,6 +2447,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

+#ifdef GGML_USE_K_QUANTS
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
     for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2333,6 +2461,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
+#endif

     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -2358,12 +2487,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
-        // uncomment this to keep the output layer in FP16
-        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
-            quantize = false;
-        }
-        quantize = quantize && quantized_type != tensor.type;
+        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+        quantize &= quantized_type != tensor.type;

         enum ggml_type new_type;
         void * new_data;
@@ -2377,31 +2502,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-            // TODO: temporary disabled until Metal / OpenCL support is available
-            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
-            //if (tensor.name == "output.weight") {
-            //    new_type = GGML_TYPE_Q6_K;
-            //}
-            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+#ifdef GGML_USE_K_QUANTS
+            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+                    fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+                    fprintf(stderr, "========================================================================================\n\n");
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                }
+            }
+            if (tensor.name == "output.weight") {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K == 0 && ny % QK_K == 0) {
+                    new_type = GGML_TYPE_Q6_K;
+                }
+            } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                         (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
-            }
-            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                         (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
-            }
-            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+#endif

             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
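To make the new constraint concrete: k-quant super-blocks cover QK_K weights, so both dimensions of a 2D tensor must be divisible by it. A standalone sketch of the check, assuming QK_K = 256 as in ggml's k-quant code:

#include <cstdio>

constexpr int QK_K = 256; // assumed k-quant super-block size

// Returns true if an [nx x ny] 2D tensor can use the k-quant formats.
static bool k_quant_compatible(int nx, int ny) {
    return nx % QK_K == 0 && ny % QK_K == 0;
}

int main() {
    // A LLaMA-style output matrix, 4096 x 32000: both divisible by 256, so Q6_K is allowed.
    printf("4096 x 32000: %s\n", k_quant_compatible(4096, 32000) ? "ok" : "not divisible by QK_K");
    // An odd-shaped tensor, 4096 x 32001, would be rejected by the new check.
    printf("4096 x 32001: %s\n", k_quant_compatible(4096, 32001) ? "ok" : "not divisible by QK_K");
    return 0;
}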
@@ -2508,12 +2645,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // interface implementation
 //

-struct llama_context * llama_init_from_file(
+struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_context_params params) {
     ggml_time_init();

-    llama_context * ctx = new llama_context;
+    llama_model * model = new llama_model;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        delete model;
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        return nullptr;
+    }
+
+    return model;
+}
+
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+struct llama_context * llama_new_context_with_model(
+        struct llama_model * model,
+        struct llama_context_params params) {
+
+    if (!model) {
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model, model->vocab);

     if (params.seed < 0) {
         params.seed = time(NULL);
@@ -2541,24 +2705,16 @@ struct llama_context * llama_init_from_file(

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
         }

         {
-            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
             fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
@@ -2588,6 +2744,7 @@ struct llama_context * llama_init_from_file(

     void * data_ptr  = NULL;
     size_t data_size = 0;

     if (params.use_mmap) {
         data_ptr  = ctx->model.mapping->addr;
         data_size = ctx->model.mapping->size;
@@ -2596,6 +2753,10 @@ struct llama_context * llama_init_from_file(
         data_size = ggml_get_mem_size (ctx->model.ctx);
     }

+    const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+    printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \
         fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2603,12 +2764,13 @@ struct llama_context * llama_init_from_file(
         return NULL; \
     }

-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
+    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
@@ -2616,7 +2778,23 @@ struct llama_context * llama_init_from_file(
     return ctx;
 }

+struct llama_context * llama_init_from_file(
+        const char * path_model,
+        struct llama_context_params params) {
+
+    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    if (!model) {
+        return nullptr;
+    }
+    struct llama_context * ctx = llama_new_context_with_model(model, params);
+    ctx->model_owner = true;
+    return ctx;
+}
+
 void llama_free(struct llama_context * ctx) {
+    if (ctx->model_owner) {
+        delete &ctx->model;
+    }
     delete ctx;
 }
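With the model and context split apart, one set of weights can back several contexts. A minimal usage sketch of the new entry points (model path and parameter values are placeholders):

#include "llama.h"
#include <cstdio>

int main() {
    llama_context_params params = llama_context_default_params();
    params.n_ctx = 512; // illustrative value

    // Load the weights once...
    llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ...then create (possibly several) contexts that share them.
    llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    // ... evaluate / sample here ...

    llama_free(ctx);         // a context created this way does not own the model
    llama_free_model(model); // so the model must be freed separately
    return 0;
}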
@@ -2633,11 +2811,9 @@ int llama_model_quantize(
     }
 }

-int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

-    auto & model = ctx->model;
-
     const int64_t t_start_lora_us = ggml_time_us();

     auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2880,7 +3056,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
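A LoRA adapter can now also be applied straight to a loaded model, before any context exists. A small sketch, with placeholder paths and thread count:

#include "llama.h"
#include <cstdio>

// Sketch only: load a model and apply a LoRA adapter to it before creating any context.
static llama_model * load_model_with_lora(const char * model_path, const char * lora_path) {
    llama_context_params params = llama_context_default_params();

    llama_model * model = llama_load_model_from_file(model_path, params);
    if (model == NULL) {
        return NULL;
    }

    // NULL base model: apply the adapter on top of the weights we just loaded.
    if (llama_model_apply_lora_from_file(model, lora_path, NULL, /*n_threads=*/4) != 0) {
        fprintf(stderr, "failed to apply lora adapter '%s'\n", lora_path);
        llama_free_model(model);
        return NULL;
    }
    return model;
}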
@@ -2888,7 +3073,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 }

 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->model.kv_self.n;
+    return ctx->kv_self.n;
 }

 #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -2913,7 +3098,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size = sizeof(size_t);
     const size_t s_kv_ntok = sizeof(int);
-    const size_t s_kv = ctx->model.kv_self.buf.size;
+    const size_t s_kv = ctx->kv_self.buf.size;

     const size_t s_total = (
         + s_rng_size
@@ -2979,7 +3164,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

     // copy kv cache
     {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd  = hparams.n_embd;
@@ -2994,9 +3179,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
     if (kv_size) {
         const size_t elt_size = ggml_element_size(kv_self.k);

-        char buffer[4096];
-
-        ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
         gf.n_threads = 1;
@@ -3085,7 +3268,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

     // set kv cache
     {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd  = hparams.n_embd;
@@ -3102,9 +3285,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

         const size_t elt_size = ggml_element_size(kv_self.k);

-        char buffer[4096];
-
-        ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
         gf.n_threads = 1;
@@ -3131,7 +3312,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_free(cpy_ctx);
     }

-    ctx->model.kv_self.n = kv_ntok;
+    ctx->kv_self.n = kv_ntok;
     }

     const size_t nread = inp - src;
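These hunks sit on the state (de)serialization path; for reference, a short sketch of how the public state API built on top of it is typically used (buffer handling simplified):

#include "llama.h"
#include <cstdint>
#include <vector>

// Sketch only: snapshot a context's state (RNG, logits, embeddings, KV cache)
// into a byte buffer and restore it later, e.g. to roll back a generation.
static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx)); // upper bound on the state size
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written);
    return buf;
}

static void restore_state(llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}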
@@ -3288,6 +3469,19 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }

+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    for (int i = 0; i<n; ++i) {
+        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = ctx->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
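A small sketch of how a caller might use the new llama_get_vocab accessor; the buffer size is illustrative:

#include "llama.h"
#include <cstdio>
#include <vector>

// Sketch only: dump the first few vocabulary entries of a loaded context.
static void print_vocab_head(const llama_context * ctx, int max_entries) {
    std::vector<const char *> strings(max_entries);
    std::vector<float>        scores(max_entries);

    const int n = llama_get_vocab(ctx, strings.data(), scores.data(), max_entries);
    for (int i = 0; i < n; ++i) {
        printf("%5d: '%s' (score %.3f)\n", i, strings[i], scores[i]);
    }
}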
@@ -3326,9 +3520,12 @@ void llama_print_timings(struct llama_context * ctx) {

     fprintf(stderr, "\n");
     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
-    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
@@ -3362,6 +3559,6 @@ const char * llama_print_system_info(void) {
 }

 // For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
llama.h (63 changed lines)

@@ -26,6 +26,14 @@
 # define LLAMA_API
 #endif

+#ifdef __GNUC__
+# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+# define DEPRECATED(func, hint) func
+#endif
+
 #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
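The DEPRECATED wrapper only decorates declarations; a sketch of its intended use on a hypothetical function (the names below are not part of the header):

// Hypothetical declaration, shown only to illustrate the macro's shape.
// Under GCC/Clang it expands to:
//   int llama_old_entry_point(struct llama_context * ctx)
//       __attribute__((deprecated("please use llama_new_entry_point instead")));
LLAMA_API DEPRECATED(int llama_old_entry_point(struct llama_context * ctx),
        "please use llama_new_entry_point instead");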
@@ -53,6 +61,7 @@ extern "C" {
     // TODO: show sample usage
     //

+    struct llama_model;
     struct llama_context;

     typedef int llama_token;
@@ -72,26 +81,26 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);

     struct llama_context_params {
+        int seed;         // RNG seed, -1 for random
         int n_ctx;        // text context
         int n_batch;      // prompt processing batch size
         int n_gpu_layers; // number of layers to store in VRAM
         int main_gpu;     // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-        int seed;         // RNG seed, -1 for random
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
     };

     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
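Because the field order of llama_context_params changed, positional (aggregate) initializers will silently assign values to the wrong members; going through llama_context_default_params() and named assignments stays order-independent. A minimal sketch with illustrative values:

#include "llama.h"

// Sketch only: prefer field assignments over positional initialization so a
// reordering of llama_context_params (as in this change) cannot shift values
// into the wrong members.
static llama_context_params make_params() {
    llama_context_params params = llama_context_default_params();
    params.seed         = 42;
    params.n_ctx        = 2048;
    params.n_gpu_layers = 0;
    params.use_mmap     = true;
    return params;
}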
@@ -136,12 +145,23 @@ extern "C" {

     LLAMA_API int64_t llama_time_us();

+    LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+            struct llama_context_params   params);
+
+    LLAMA_API void llama_free_model(struct llama_model * model);
+
+    LLAMA_API struct llama_context * llama_new_context_with_model(
+                     struct llama_model * model,
+            struct llama_context_params   params);
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
+    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
                              const char * path_model,
-            struct llama_context_params   params);
+            struct llama_context_params   params),
+            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");

     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
@@ -158,8 +178,15 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API int llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
+            const char * path_lora,
+            const char * path_base_model,
+            int   n_threads),
+            "please use llama_model_apply_lora_from_file instead");
+
+    LLAMA_API int llama_model_apply_lora_from_file(
+            const struct llama_model * model,
             const char * path_lora,
             const char * path_base_model,
             int   n_threads);
@@ -220,6 +247,14 @@ extern "C" {
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    // Get the vocabulary as output parameters.
+    // Returns number of results.
+    LLAMA_API int llama_get_vocab(
+            const struct llama_context * ctx,
+            const char * * strings,
+            float * scores,
+            int capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -235,9 +270,9 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

     // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl();  // next-line

     // Sampling functions
@@ -302,7 +337,7 @@ extern "C" {
 #include <string>
 struct ggml_tensor;

-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);

 #endif
@@ -10,6 +10,10 @@

 #include <ggml.h>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 constexpr int kVecSize = 1 << 18;

 float drawFromGaussianPdf(std::mt19937& rndm) {

@@ -1,6 +1,7 @@
 import os
 import hashlib

+
 def sha256sum(file):
     block_size = 16 * 1024 * 1024 # 16 MB block size
     b = bytearray(block_size)
@@ -15,6 +16,7 @@ def sha256sum(file):

     return file_hash.hexdigest()

+
 # Define the path to the llama directory (parent folder of script directory)
 llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

spm-headers/ggml.h (symbolic link, 1 line)

@@ -0,0 +1 @@
+../ggml.h
@@ -1,3 +1,4 @@
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 #include "ggml.h"

 #include <math.h>
@@ -5,7 +6,11 @@
 #include <stdlib.h>
 #include <assert.h>

-#define MAX_NARGS 2
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define MAX_NARGS 3

 #undef MIN
 #undef MAX
@@ -197,8 +202,23 @@ bool check_gradient(
         float max_error_abs,
         float max_error_rel) {

+    static int n_threads = -1;
+    if (n_threads < 0) {
+        n_threads = GGML_DEFAULT_N_THREADS;
+
+        const char *env = getenv("GGML_N_THREADS");
+        if (env) {
+            n_threads = atoi(env);
+        }
+
+        printf("GGML_N_THREADS = %d\n", n_threads);
+    }
+
     struct ggml_cgraph gf = ggml_build_forward (f);
+    gf.n_threads = n_threads;
+
     struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+    gb.n_threads = n_threads;

     ggml_graph_compute(ctx0, &gf);
     ggml_graph_reset (&gf);
@@ -1090,6 +1110,25 @@ int main(int argc, const char ** argv) {
             }
         }

+        // cross_entropy_loss
+        {
+            const int nargs = 1;
+
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+
+            for (int ndims = 1; ndims <= 3; ++ndims) {
+                x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
+
+                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY);
+                // finite differences regularly fails!
+            }
+        }
+
         // rope
         {
             const int nargs = 1;
@@ -1124,6 +1163,45 @@ int main(int argc, const char ** argv) {
             }
         }

+        // flash_attn
+        {
+            const int nargs = 3;
+
+            int64_t ne2[4];
+
+            get_random_dims(ne2, 4);
+            int64_t D = ne2[0];
+            int64_t N = ne2[1];
+            int64_t M = ne2[2] + N;
+            int64_t B = ne2[3];
+
+            for (int masked = 0; masked <= 1; ++masked) {
+                for (int ndims = 2; ndims <= 4; ++ndims) {
+                    int64_t neq[4] = { D, N, B, ne[3] };
+                    int64_t nek[4] = { D, M, B, ne[3] };
+                    int64_t nev[4] = { M, D, B, ne[3] };
+                    if (ndims == 2) {
+                        neq[2] = 1; neq[3] = 1;
+                        nek[2] = 1; nek[3] = 1;
+                        nev[2] = 1; nev[3] = 1;
+                    } else if (ndims == 3) {
+                        neq[3] = 1;
+                        nek[3] = 1;
+                        nev[3] = 1;
+                    }
+                    x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                    x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                    x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                    ggml_set_param(ctx0, x[0]);
+                    ggml_set_param(ctx0, x[1]);
+                    ggml_set_param(ctx0, x[2]);
+
+                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+
+                    check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+                }
+            }
+        }
         ggml_free(ctx0);
     }
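check_gradient compares ggml's analytic backward pass against finite differences; a self-contained sketch of that principle on a scalar function (not the test's own code):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Compare an analytic derivative against a central finite difference, which is
// what a gradient check like the one in this test file boils down to.
int main() {
    auto f  = [](double x) { return x * x * std::sin(x); };
    auto df = [](double x) { return 2.0 * x * std::sin(x) + x * x * std::cos(x); };

    const double x   = 0.7;
    const double eps = 1e-4;

    const double g_analytic = df(x);
    const double g_numeric  = (f(x + eps) - f(x - eps)) / (2.0 * eps);

    const double err_abs = std::fabs(g_analytic - g_numeric);
    const double err_rel = err_abs / std::max(std::fabs(g_analytic), 1e-12);

    printf("analytic = %.8f, numeric = %.8f, abs err = %.2e, rel err = %.2e\n",
           g_analytic, g_numeric, err_abs, err_rel);
    return 0;
}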
@@ -9,12 +9,15 @@
 #include <string>
 #include <vector>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
-const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001;
-const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002;
-const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075;
-const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040;
-const float MAX_DOT_PRODUCT_ERROR = 0.02;
+const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
+const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
+const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
+const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
+const float MAX_DOT_PRODUCT_ERROR = 0.02f;

 const char* RESULT_STR[] = {"ok", "FAILED"};

@@ -13,6 +13,10 @@
 #include <string>
 #include <vector>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 #define MAX_ALIGNMENT 64
 #define QK 32
 #define WARMUP 5
@@ -176,27 +176,28 @@ void test_frequency_presence_penalty(
 int main(void) {
     ggml_time_init();

-    test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4}, 1);
-    test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2}, 3);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);

-    test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4}, 0);
-    test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3}, 0.7);
-    test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2, 0.1}, 1);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);

-    test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3}, 0.25);
-    test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.75);
-    test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.99);
+    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
+    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
+    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);

-    test_typical({0.97, 0.01, 0.01, 0.01}, {0.97}, 0.5);
-    test_typical({0.4, 0.2, 0.2, 0.2}, {0.2, 0.2, 0.2}, 0.5);
+    test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
+    test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);

-    test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.25, 0.25, 0.25, 0.25, 0}, 50.0);
-    test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.5, 0.5, 0, 0, 0}, 50.0);
-    test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.5, 0.5, 0, 0, 0}, 50.0);
+    test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f);
+    test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
+    test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);

-    test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.249997, 0.249997, 0.249997, 0.249997, 0.000011}, 5.0, 5.0);
-    test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.499966, 0.499966, 0.000023, 0.000023, 0.000023}, 5.0, 5.0);
-    test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.499977, 0.499977, 0.000023, 0.000023, 0.000000}, 5.0, 5.0);
+    test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f);
+    test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f);
+    test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);

     printf("OK\n");
 }
@@ -28,6 +28,7 @@ int main(int argc, char **argv) {

     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

+    llama_model * model;
     llama_context * ctx;

     // load the vocab
@@ -36,10 +37,18 @@ int main(int argc, char **argv) {

         lparams.vocab_only = true;

-        ctx = llama_init_from_file(fname.c_str(), lparams);
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);

         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
             return 1;
         }
     }
@@ -48,12 +57,14 @@ int main(int argc, char **argv) {

     if (n_vocab != 32000) {
         fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
+        llama_free_model(model);
+        llama_free(ctx);
         return 2;
     }

     for (const auto & test_kv : k_tests()) {
         std::vector<llama_token> res(test_kv.first.size());
-        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
+        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
         res.resize(n);

         bool correct = res.size() == test_kv.second.size();
@@ -77,10 +88,13 @@ int main(int argc, char **argv) {
             }
             fprintf(stderr, "\n");

+            llama_free_model(model);
+            llama_free(ctx);
             return 3;
         }
     }

+    llama_free_model(model);
     llama_free(ctx);

     return 0;