Merge remote-tracking branch 'origin/master' into sl/micro-batching
25
.github/workflows/build.yml
vendored
|
@ -515,6 +515,31 @@ jobs:
|
||||||
- name: Build Xcode project
|
- name: Build Xcode project
|
||||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||||
|
|
||||||
|
android-build:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up JDK
|
||||||
|
uses: actions/setup-java@v3
|
||||||
|
with:
|
||||||
|
java-version: 17
|
||||||
|
distribution: zulu
|
||||||
|
|
||||||
|
- name: Setup Android SDK
|
||||||
|
uses: android-actions/setup-android@v3
|
||||||
|
with:
|
||||||
|
log-accepted-android-sdk-licenses: false
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
run: |
|
||||||
|
cd examples/llama.android
|
||||||
|
|
||||||
|
# Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
|
||||||
|
./gradlew build --no-daemon -Pskip-armeabi-v7a
|
||||||
|
|
||||||
# freeBSD-latest:
|
# freeBSD-latest:
|
||||||
# runs-on: macos-12
|
# runs-on: macos-12
|
||||||
# steps:
|
# steps:
|
||||||
|
|
1
.gitignore
vendored
|
@ -105,3 +105,4 @@ poetry.toml
|
||||||
/tests/test-tokenizer-1-bpe
|
/tests/test-tokenizer-1-bpe
|
||||||
/tests/test-rope
|
/tests/test-rope
|
||||||
/tests/test-backend-ops
|
/tests/test-backend-ops
|
||||||
|
/tests/test-autorelease
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
cmake_minimum_required(VERSION 3.13) # for add_link_options
|
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||||
project("llama.cpp" C CXX)
|
project("llama.cpp" C CXX)
|
||||||
|
|
||||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
@ -47,6 +47,7 @@ option(BUILD_SHARED_LIBS "build shared libraries"
|
||||||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||||
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
|
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
|
||||||
option(LLAMA_LTO "llama: enable link time optimization" OFF)
|
option(LLAMA_LTO "llama: enable link time optimization" OFF)
|
||||||
|
option(LLAMA_CCACHE "llama: use ccache if available" ON)
|
||||||
|
|
||||||
# debug
|
# debug
|
||||||
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
|
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
|
||||||
|
@ -76,6 +77,10 @@ if (NOT MSVC)
|
||||||
option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
|
option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (WIN32)
    # Windows API version the build targets; it is passed to the compiler as
    # _WIN32_WINNT for MinGW builds (0x602 = Windows 8).
    # This must be a STRING cache entry: option() only defines boolean cache
    # entries, so a default of 0x602 would be stored as OFF instead of the
    # version number.
    set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version")
endif()
|
||||||
|
|
||||||
# 3rd party libs
|
# 3rd party libs
|
||||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||||
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
||||||
|
@ -557,6 +562,17 @@ if (LLAMA_LTO)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (LLAMA_CCACHE)
    # Look for a ccache executable; find_program stores the result in the
    # cache variable LLAMA_CCACHE_FOUND.
    find_program(LLAMA_CCACHE_FOUND ccache)
    if (NOT LLAMA_CCACHE_FOUND)
        # Not fatal: the build simply proceeds without a compiler cache.
        message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
    else()
        # Prefix every compile rule with ccache.
        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
        # NOTE(review): set(ENV{...}) changes the environment of the running
        # CMake process only; confirm that CCACHE_SLOPPINESS actually reaches
        # ccache at build time.
        set(ENV{CCACHE_SLOPPINESS} time_macros)
        message(STATUS "Using ccache")
    endif()
endif()
|
||||||
|
|
||||||
# this version of Apple ld64 is buggy
|
# this version of Apple ld64 is buggy
|
||||||
execute_process(
|
execute_process(
|
||||||
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
|
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
|
||||||
|
@ -590,6 +606,13 @@ if (NOT MSVC)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# add_compile_option_cpp(<option>)
#
# Adds <option> as a compile option for C and C++ sources only. Each entry is
# wrapped in a COMPILE_LANGUAGE generator expression, so other languages
# (e.g. CUDA) do not receive it. Use it for flags such as CPU-architecture
# switches.
#
#   ARG - the compiler option to add, e.g. -mavx2 or /arch:AVX2
function(add_compile_option_cpp ARG)
    foreach (lang IN ITEMS CXX C)
        add_compile_options($<$<COMPILE_LANGUAGE:${lang}>:${ARG}>)
    endforeach()
endfunction()
|
||||||
|
|
||||||
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
|
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
|
||||||
message(STATUS "ARM detected")
|
message(STATUS "ARM detected")
|
||||||
if (MSVC)
|
if (MSVC)
|
||||||
|
@ -624,8 +647,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
|
||||||
include(cmake/FindSIMD.cmake)
|
include(cmake/FindSIMD.cmake)
|
||||||
endif ()
|
endif ()
|
||||||
if (LLAMA_AVX512)
|
if (LLAMA_AVX512)
|
||||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
|
add_compile_option_cpp(/arch:AVX512)
|
||||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
|
|
||||||
# MSVC has no compile-time flags enabling specific
|
# MSVC has no compile-time flags enabling specific
|
||||||
# AVX512 extensions, neither it defines the
|
# AVX512 extensions, neither it defines the
|
||||||
# macros corresponding to the extensions.
|
# macros corresponding to the extensions.
|
||||||
|
@ -639,37 +661,35 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
|
||||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
|
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
|
||||||
endif()
|
endif()
|
||||||
elseif (LLAMA_AVX2)
|
elseif (LLAMA_AVX2)
|
||||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
|
add_compile_option_cpp(/arch:AVX2)
|
||||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
|
|
||||||
elseif (LLAMA_AVX)
|
elseif (LLAMA_AVX)
|
||||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
|
add_compile_option_cpp(/arch:AVX)
|
||||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
|
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
if (LLAMA_NATIVE)
|
if (LLAMA_NATIVE)
|
||||||
add_compile_options(-march=native)
|
add_compile_option_cpp(-march=native)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_F16C)
|
if (LLAMA_F16C)
|
||||||
add_compile_options(-mf16c)
|
add_compile_option_cpp(-mf16c)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_FMA)
|
if (LLAMA_FMA)
|
||||||
add_compile_options(-mfma)
|
add_compile_option_cpp(-mfma)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_AVX)
|
if (LLAMA_AVX)
|
||||||
add_compile_options(-mavx)
|
add_compile_option_cpp(-mavx)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_AVX2)
|
if (LLAMA_AVX2)
|
||||||
add_compile_options(-mavx2)
|
add_compile_option_cpp(-mavx2)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_AVX512)
|
if (LLAMA_AVX512)
|
||||||
add_compile_options(-mavx512f)
|
add_compile_option_cpp(-mavx512f)
|
||||||
add_compile_options(-mavx512bw)
|
add_compile_option_cpp(-mavx512bw)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_AVX512_VBMI)
|
if (LLAMA_AVX512_VBMI)
|
||||||
add_compile_options(-mavx512vbmi)
|
add_compile_option_cpp(-mavx512vbmi)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_AVX512_VNNI)
|
if (LLAMA_AVX512_VNNI)
|
||||||
add_compile_options(-mavx512vnni)
|
add_compile_option_cpp(-mavx512vnni)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||||
|
@ -686,7 +706,7 @@ endif()
|
||||||
|
|
||||||
if (MINGW)
|
if (MINGW)
|
||||||
# Target Windows 8 for PrefetchVirtualMemory
|
# Target Windows 8 for PrefetchVirtualMemory
|
||||||
add_compile_definitions(_WIN32_WINNT=0x602)
|
add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -838,7 +858,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
|
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
|
||||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||||
|
|
||||||
set(GGML_PUBLIC_HEADERS "ggml.h"
|
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
|
||||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
||||||
|
|
||||||
|
|
14
Makefile
|
@ -9,7 +9,7 @@ TEST_TARGETS = \
|
||||||
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
||||||
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
||||||
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
|
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
|
||||||
tests/test-backend-ops
|
tests/test-backend-ops tests/test-autorelease
|
||||||
|
|
||||||
# Code coverage output files
|
# Code coverage output files
|
||||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||||
|
@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
|
|
||||||
BUILD_TARGETS += metal
|
|
||||||
endif
|
|
||||||
|
|
||||||
default: $(BUILD_TARGETS)
|
default: $(BUILD_TARGETS)
|
||||||
|
|
||||||
test: $(TEST_TARGETS)
|
test: $(TEST_TARGETS)
|
||||||
|
@ -671,11 +667,6 @@ lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
ifdef LLAMA_METAL
|
|
||||||
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(UNAME_S),Darwin)
|
ifeq ($(UNAME_S),Darwin)
|
||||||
swift: examples/batched.swift
|
swift: examples/batched.swift
|
||||||
(cd examples/batched.swift; make build)
|
(cd examples/batched.swift; make build)
|
||||||
|
@ -756,3 +747,6 @@ tests/test-c.o: tests/test-c.c llama.h
|
||||||
|
|
||||||
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
|
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
|
@ -128,6 +128,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||||
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
|
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
|
||||||
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
||||||
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
||||||
|
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
||||||
|
|
||||||
**UI:**
|
**UI:**
|
||||||
|
|
||||||
|
|
|
@ -43,7 +43,7 @@ Example for llama model
|
||||||
# For llama7b and llama2 models
|
# For llama7b and llama2 models
|
||||||
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
|
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
|
||||||
# For mistral and mpt models
|
# For mistral and mpt models
|
||||||
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
|
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
|
||||||
```
|
```
|
||||||
|
|
||||||
## Quantize
|
## Quantize
|
||||||
|
|
21
ci/run.sh
|
@ -36,6 +36,10 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# When GG_BUILD_CUDA is set to a non-empty value, append the cuBLAS switch to
# CMAKE_EXTRA so that cmake invocations using ${CMAKE_EXTRA} build with CUDA.
# The expansion is quoted so the test stays well-formed when the variable is
# unset or contains whitespace.
if [ -n "${GG_BUILD_CUDA}" ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
fi
|
||||||
|
|
||||||
## helpers
|
## helpers
|
||||||
|
|
||||||
# download a file if it does not exist or if it is outdated
|
# download a file if it does not exist or if it is outdated
|
||||||
|
@ -160,7 +164,7 @@ function gg_run_open_llama_3b_v2 {
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert.py ${path_models}
|
python3 ../convert.py ${path_models}
|
||||||
|
@ -179,6 +183,8 @@ function gg_run_open_llama_3b_v2 {
|
||||||
|
|
||||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||||
|
|
||||||
|
./bin/test-autorelease ${model_f16}
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
@ -214,6 +220,8 @@ function gg_run_open_llama_3b_v2 {
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
|
@ -241,6 +249,8 @@ function gg_run_open_llama_3b_v2 {
|
||||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
# lora
|
# lora
|
||||||
function compare_ppl {
|
function compare_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@ -282,7 +292,6 @@ function gg_run_open_llama_3b_v2 {
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
||||||
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -292,6 +301,7 @@ function gg_sum_open_llama_3b_v2 {
|
||||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
gg_printf 'OpenLLaMA 3B-v2:\n'
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
@ -337,7 +347,7 @@ function gg_run_open_llama_7b_v2 {
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert.py ${path_models}
|
python3 ../convert.py ${path_models}
|
||||||
|
@ -391,6 +401,8 @@ function gg_run_open_llama_7b_v2 {
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
|
@ -418,6 +430,8 @@ function gg_run_open_llama_7b_v2 {
|
||||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
# lora
|
# lora
|
||||||
function compare_ppl {
|
function compare_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@ -469,6 +483,7 @@ function gg_sum_open_llama_7b_v2 {
|
||||||
gg_printf 'OpenLLaMA 7B-v2:\n'
|
gg_printf 'OpenLLaMA 7B-v2:\n'
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
|
|
@ -167,6 +167,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
if (params.n_threads_batch <= 0) {
|
if (params.n_threads_batch <= 0) {
|
||||||
params.n_threads_batch = std::thread::hardware_concurrency();
|
params.n_threads_batch = std::thread::hardware_concurrency();
|
||||||
}
|
}
|
||||||
|
} else if (arg == "-td" || arg == "--threads-draft") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.n_threads_draft = std::stoi(argv[i]);
|
||||||
|
if (params.n_threads_draft <= 0) {
|
||||||
|
params.n_threads_draft = std::thread::hardware_concurrency();
|
||||||
|
}
|
||||||
|
} else if (arg == "-tbd" || arg == "--threads-batch-draft") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.n_threads_batch_draft = std::stoi(argv[i]);
|
||||||
|
if (params.n_threads_batch_draft <= 0) {
|
||||||
|
params.n_threads_batch_draft = std::thread::hardware_concurrency();
|
||||||
|
}
|
||||||
} else if (arg == "-p" || arg == "--prompt") {
|
} else if (arg == "-p" || arg == "--prompt") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -663,6 +681,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.hellaswag_tasks = std::stoi(argv[i]);
|
params.hellaswag_tasks = std::stoi(argv[i]);
|
||||||
|
} else if (arg == "--winogrande") {
|
||||||
|
params.winogrande = true;
|
||||||
|
} else if (arg == "--winogrande-tasks") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.winogrande_tasks = std::stoi(argv[i]);
|
||||||
} else if (arg == "--ignore-eos") {
|
} else if (arg == "--ignore-eos") {
|
||||||
params.ignore_eos = true;
|
params.ignore_eos = true;
|
||||||
} else if (arg == "--no-penalize-nl") {
|
} else if (arg == "--no-penalize-nl") {
|
||||||
|
@ -845,6 +871,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
||||||
printf(" -tb N, --threads-batch N\n");
|
printf(" -tb N, --threads-batch N\n");
|
||||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||||
|
// Usage text for the draft-model thread count. Both lines end with '\n' so the
// entry is laid out like the neighbouring -tb / -tbd entries instead of running
// into the next option on the same output line.
printf(" -td N, --threads-draft N\n");
printf(" number of threads to use during generation (default: same as --threads)\n");
|
||||||
|
printf(" -tbd N, --threads-batch-draft N\n");
|
||||||
|
printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
|
||||||
printf(" -p PROMPT, --prompt PROMPT\n");
|
printf(" -p PROMPT, --prompt PROMPT\n");
|
||||||
printf(" prompt to start generation with (default: empty)\n");
|
printf(" prompt to start generation with (default: empty)\n");
|
||||||
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
|
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
|
||||||
|
@ -904,6 +934,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
||||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||||
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||||
|
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
|
||||||
|
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
|
||||||
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||||
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
||||||
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||||
|
|
|
@ -46,7 +46,9 @@ struct gpt_params {
|
||||||
uint32_t seed = -1; // RNG seed
|
uint32_t seed = -1; // RNG seed
|
||||||
|
|
||||||
int32_t n_threads = get_num_physical_cores();
|
int32_t n_threads = get_num_physical_cores();
|
||||||
|
int32_t n_threads_draft = -1;
|
||||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||||
|
int32_t n_threads_batch_draft = -1;
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
int32_t n_ctx = 512; // context size
|
int32_t n_ctx = 512; // context size
|
||||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
|
@ -103,6 +105,9 @@ struct gpt_params {
|
||||||
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||||
|
|
||||||
|
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
|
||||||
|
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
|
||||||
|
|
||||||
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
||||||
bool random_prompt = false; // do not randomize prompt if none provided
|
bool random_prompt = false; // do not randomize prompt if none provided
|
||||||
bool use_color = false; // use color to distinguish generations and inputs
|
bool use_color = false; // use color to distinguish generations and inputs
|
||||||
|
|
|
@ -190,6 +190,11 @@ static llama_token llama_sampling_sample_impl(
|
||||||
logits[it->first] += it->second;
|
logits[it->first] += it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ctx_cfg) {
|
||||||
|
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
|
||||||
|
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
|
||||||
|
}
|
||||||
|
|
||||||
cur.clear();
|
cur.clear();
|
||||||
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
@ -198,10 +203,6 @@ static llama_token llama_sampling_sample_impl(
|
||||||
|
|
||||||
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
|
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
|
||||||
|
|
||||||
if (ctx_cfg) {
|
|
||||||
llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
// apply penalties
|
// apply penalties
|
||||||
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
|
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
|
||||||
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
|
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
|
||||||
|
|
|
@ -17,7 +17,7 @@ typedef struct llama_sampling_params {
|
||||||
float min_p = 0.05f; // 0.0 = disabled
|
float min_p = 0.05f; // 0.0 = disabled
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
float typical_p = 1.00f; // 1.0 = disabled
|
float typical_p = 1.00f; // 1.0 = disabled
|
||||||
float temp = 0.80f; // 1.0 = disabled
|
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||||
|
|
|
@ -189,6 +189,8 @@ class Model:
|
||||||
return StableLMModel
|
return StableLMModel
|
||||||
if model_architecture == "QWenLMHeadModel":
|
if model_architecture == "QWenLMHeadModel":
|
||||||
return QwenModel
|
return QwenModel
|
||||||
|
if model_architecture == "Qwen2ForCausalLM":
|
||||||
|
return Model
|
||||||
if model_architecture == "MixtralForCausalLM":
|
if model_architecture == "MixtralForCausalLM":
|
||||||
return MixtralModel
|
return MixtralModel
|
||||||
if model_architecture == "GPT2LMHeadModel":
|
if model_architecture == "GPT2LMHeadModel":
|
||||||
|
@ -197,6 +199,8 @@ class Model:
|
||||||
return Phi2Model
|
return Phi2Model
|
||||||
if model_architecture == "PlamoForCausalLM":
|
if model_architecture == "PlamoForCausalLM":
|
||||||
return PlamoModel
|
return PlamoModel
|
||||||
|
if model_architecture == "CodeShellForCausalLM":
|
||||||
|
return CodeShellModel
|
||||||
return Model
|
return Model
|
||||||
|
|
||||||
def _is_model_safetensors(self) -> bool:
|
def _is_model_safetensors(self) -> bool:
|
||||||
|
@ -234,6 +238,8 @@ class Model:
|
||||||
return gguf.MODEL_ARCH.STABLELM
|
return gguf.MODEL_ARCH.STABLELM
|
||||||
if arch == "QWenLMHeadModel":
|
if arch == "QWenLMHeadModel":
|
||||||
return gguf.MODEL_ARCH.QWEN
|
return gguf.MODEL_ARCH.QWEN
|
||||||
|
if arch == "Qwen2ForCausalLM":
|
||||||
|
return gguf.MODEL_ARCH.QWEN2
|
||||||
if arch == "MixtralForCausalLM":
|
if arch == "MixtralForCausalLM":
|
||||||
return gguf.MODEL_ARCH.LLAMA
|
return gguf.MODEL_ARCH.LLAMA
|
||||||
if arch == "GPT2LMHeadModel":
|
if arch == "GPT2LMHeadModel":
|
||||||
|
@ -242,6 +248,8 @@ class Model:
|
||||||
return gguf.MODEL_ARCH.PHI2
|
return gguf.MODEL_ARCH.PHI2
|
||||||
if arch == "PlamoForCausalLM":
|
if arch == "PlamoForCausalLM":
|
||||||
return gguf.MODEL_ARCH.PLAMO
|
return gguf.MODEL_ARCH.PLAMO
|
||||||
|
if arch == "CodeShellForCausalLM":
|
||||||
|
return gguf.MODEL_ARCH.CODESHELL
|
||||||
|
|
||||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||||
|
|
||||||
|
@ -266,7 +274,6 @@ class Model:
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
elif reverse_vocab[i] in added_vocab:
|
elif reverse_vocab[i] in added_vocab:
|
||||||
tokens.append(reverse_vocab[i])
|
tokens.append(reverse_vocab[i])
|
||||||
if hasattr(tokenizer, "added_tokens_decoder"):
|
|
||||||
if tokenizer.added_tokens_decoder[i].special:
|
if tokenizer.added_tokens_decoder[i].special:
|
||||||
toktypes.append(gguf.TokenType.CONTROL)
|
toktypes.append(gguf.TokenType.CONTROL)
|
||||||
else:
|
else:
|
||||||
|
@ -1177,6 +1184,70 @@ class PlamoModel(Model):
|
||||||
self.gguf_writer.add_tensor(new_name, data)
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
|
||||||
|
class CodeShellModel(Model):
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
block_count = self.hparams["n_layer"]
|
||||||
|
|
||||||
|
self.gguf_writer.add_name("CodeShell")
|
||||||
|
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||||
|
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||||
|
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||||
|
self.gguf_writer.add_block_count(block_count)
|
||||||
|
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||||
|
self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
|
||||||
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
self.gguf_writer.add_rope_freq_base(10000.0)
|
||||||
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||||
|
self.gguf_writer.add_rope_scaling_factor(1.0)
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
tensors = dict(self.get_tensors())
|
||||||
|
has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
|
||||||
|
for name, data_torch in tensors.items():
|
||||||
|
# we don't need these
|
||||||
|
if name.endswith((".attn.rotary_emb.inv_freq")):
|
||||||
|
continue
|
||||||
|
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
if not has_lm_head and name == "transformer.wte.weight":
|
||||||
|
self.gguf_writer.add_tensor("output.weight", data)
|
||||||
|
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
###### CONVERSION LOGIC ######
|
###### CONVERSION LOGIC ######
|
||||||
|
|
||||||
|
|
||||||
|
|
22
convert.py
|
@ -348,7 +348,7 @@ class Params:
|
||||||
f_rope_freq_base = 1e6
|
f_rope_freq_base = 1e6
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
n_vocab=config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
|
n_vocab=model["tok_embeddings.weight"].shape[0],
|
||||||
n_embd=config["dim"],
|
n_embd=config["dim"],
|
||||||
n_layer=config["n_layers"],
|
n_layer=config["n_layers"],
|
||||||
n_ctx=n_ctx,
|
n_ctx=n_ctx,
|
||||||
|
@ -387,6 +387,7 @@ class BpeVocab: # GPT
|
||||||
self.bpe_tokenizer = json.loads(
|
self.bpe_tokenizer = json.loads(
|
||||||
open(str(fname_tokenizer), encoding="utf-8").read()
|
open(str(fname_tokenizer), encoding="utf-8").read()
|
||||||
)
|
)
|
||||||
|
self.vocab = self.bpe_tokenizer["model"]["vocab"]
|
||||||
added_tokens: dict[str, int]
|
added_tokens: dict[str, int]
|
||||||
if fname_added_tokens is not None:
|
if fname_added_tokens is not None:
|
||||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
||||||
|
@ -405,7 +406,7 @@ class BpeVocab: # GPT
|
||||||
if item["content"] not in self.bpe_tokenizer
|
if item["content"] not in self.bpe_tokenizer
|
||||||
)
|
)
|
||||||
|
|
||||||
vocab_size: int = len(self.bpe_tokenizer)
|
vocab_size: int = len(self.vocab)
|
||||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||||
actual_ids = sorted(added_tokens.values())
|
actual_ids = sorted(added_tokens.values())
|
||||||
if expected_ids != actual_ids:
|
if expected_ids != actual_ids:
|
||||||
|
@ -415,6 +416,7 @@ class BpeVocab: # GPT
|
||||||
)
|
)
|
||||||
|
|
||||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||||
|
self.added_tokens_dict = added_tokens
|
||||||
self.added_tokens_list = [text for (text, idx) in items]
|
self.added_tokens_list = [text for (text, idx) in items]
|
||||||
self.vocab_size_base: int = vocab_size
|
self.vocab_size_base: int = vocab_size
|
||||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
|
@ -422,10 +424,9 @@ class BpeVocab: # GPT
|
||||||
self.fname_added_tokens = fname_added_tokens
|
self.fname_added_tokens = fname_added_tokens
|
||||||
|
|
||||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
tokenizer = self.bpe_tokenizer
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
|
||||||
|
|
||||||
for i, _ in enumerate(tokenizer):
|
for i, _ in enumerate(self.vocab):
|
||||||
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
|
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
|
||||||
|
|
||||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
|
@ -466,6 +467,7 @@ class SentencePieceVocab: # LlaMa
|
||||||
)
|
)
|
||||||
|
|
||||||
# Token pieces that were added to the base vocabulary.
|
# Token pieces that were added to the base vocabulary.
|
||||||
|
self.added_tokens_dict = added_tokens
|
||||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
||||||
self.vocab_size_base = vocab_size
|
self.vocab_size_base = vocab_size
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
|
@ -1006,6 +1008,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
|
||||||
)
|
)
|
||||||
for i in range(1, pad_count + 1):
|
for i in range(1, pad_count + 1):
|
||||||
vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
|
vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
|
||||||
|
vocab.added_tokens_list.append(f"<dummy{i:05}>")
|
||||||
vocab.vocab_size = params.n_vocab
|
vocab.vocab_size = params.n_vocab
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -1097,6 +1100,8 @@ class OutputFile:
|
||||||
scores.append(score)
|
scores.append(score)
|
||||||
toktypes.append(toktype)
|
toktypes.append(toktype)
|
||||||
|
|
||||||
|
assert len(tokens) == vocab.vocab_size
|
||||||
|
|
||||||
return tokens, scores, toktypes
|
return tokens, scores, toktypes
|
||||||
|
|
||||||
def add_meta_vocab(self, vocab: Vocab) -> None:
|
def add_meta_vocab(self, vocab: Vocab) -> None:
|
||||||
|
@ -1373,15 +1378,14 @@ class VocabFactory:
|
||||||
self.files[file] = file_path
|
self.files[file] = file_path
|
||||||
elif parent_file_path.exists():
|
elif parent_file_path.exists():
|
||||||
self.files[file] = parent_file_path
|
self.files[file] = parent_file_path
|
||||||
|
print(f"Found vocab files: {self.files}")
|
||||||
|
|
||||||
def _select_file(self, vocabtype: Optional[str]) -> Path:
|
def _select_file(self, vocabtype: Optional[str]) -> Path:
|
||||||
if vocabtype in ["spm", "bpe"]:
|
if vocabtype in ["spm", "bpe"]:
|
||||||
# For SentencePiece and BPE, return specific files as before
|
for file_key in self.files.keys():
|
||||||
file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
|
|
||||||
if self.files[file_key]:
|
if self.files[file_key]:
|
||||||
return self.files[file_key]
|
return self.files[file_key]
|
||||||
else:
|
raise FileNotFoundError(f"{vocabtype} vocab not found.")
|
||||||
raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
|
|
||||||
elif vocabtype == "hfft":
|
elif vocabtype == "hfft":
|
||||||
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
|
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
|
||||||
return self.path
|
return self.path
|
||||||
|
|
|
@ -37,9 +37,6 @@ else()
|
||||||
add_subdirectory(lookup)
|
add_subdirectory(lookup)
|
||||||
add_subdirectory(train-text-from-scratch)
|
add_subdirectory(train-text-from-scratch)
|
||||||
add_subdirectory(imatrix)
|
add_subdirectory(imatrix)
|
||||||
if (LLAMA_METAL)
|
|
||||||
add_subdirectory(metal)
|
|
||||||
endif()
|
|
||||||
if (LLAMA_BUILD_SERVER)
|
if (LLAMA_BUILD_SERVER)
|
||||||
add_subdirectory(server)
|
add_subdirectory(server)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
|
||||||
// Set up a the benchmark matrices
|
// Set up a the benchmark matrices
|
||||||
// printf("Creating new tensor q11 & Running quantize\n");
|
// printf("Creating new tensor q11 & Running quantize\n");
|
||||||
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
||||||
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
|
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
|
||||||
|
|
||||||
// Set up a the compute graph
|
// Set up a the compute graph
|
||||||
// printf("Creating new tensor q31\n");
|
// printf("Creating new tensor q31\n");
|
||||||
|
@ -207,7 +207,7 @@ int main(int argc, char ** argv) {
|
||||||
// Set up a second graph computation to make sure we override the CPU cache lines
|
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||||
// printf("Creating new tensor q12 & Running quantize\n");
|
// printf("Creating new tensor q12 & Running quantize\n");
|
||||||
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
||||||
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
|
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
|
||||||
|
|
||||||
// printf("Creating new tensor q32\n");
|
// printf("Creating new tensor q32\n");
|
||||||
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
||||||
|
|
|
@ -1138,9 +1138,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
|
||||||
return tn_buf.data();
|
return tn_buf.data();
|
||||||
};
|
};
|
||||||
|
|
||||||
uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
|
|
||||||
// write_magic
|
// write_magic
|
||||||
file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic
|
file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic
|
||||||
file.write_u32(1); // version
|
file.write_u32(1); // version
|
||||||
// write_hparams
|
// write_hparams
|
||||||
file.write_u32(lora->hparams.lora_r);
|
file.write_u32(lora->hparams.lora_r);
|
||||||
|
@ -1800,7 +1799,7 @@ int main(int argc, char ** argv) {
|
||||||
std::vector<llama_token> train_tokens;
|
std::vector<llama_token> train_tokens;
|
||||||
std::vector<size_t> train_samples_begin;
|
std::vector<size_t> train_samples_begin;
|
||||||
std::vector<size_t> train_samples_size;
|
std::vector<size_t> train_samples_size;
|
||||||
printf("%s: tokenize training data\n", __func__);
|
printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
|
||||||
tokenize_file(lctx,
|
tokenize_file(lctx,
|
||||||
params.common.fn_train_data,
|
params.common.fn_train_data,
|
||||||
params.common.sample_start,
|
params.common.sample_start,
|
||||||
|
|
32
examples/imatrix/README.md
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
# llama.cpp/examples/imatrix
|
||||||
|
|
||||||
|
Compute an importance matrix for a model and a given text dataset. It can be used during quantization to enhance the quality of the quantized models.
|
||||||
|
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```
|
||||||
|
./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
|
||||||
|
[-ofreq num_chunks] [-ow <0 or 1>] [other common params]
|
||||||
|
```
|
||||||
|
|
||||||
|
Here `-m` with a model name and `-f` with a file containing training data (e.g. `wiki.train.raw`) are mandatory.
|
||||||
|
The parameters in square brackets are optional and have the following meaning:
|
||||||
|
* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If omitted, `imatrix.dat` is used.
|
||||||
|
* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
|
||||||
|
* `-ofreq` (or `--output-frequency`) specifies how often the result computed so far is saved to disk. The default is 10 (i.e., every 10 chunks).
|
||||||
|
* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
|
||||||
|
|
||||||
|
For faster computation, make sure to use GPU offloading via the `-ngl` argument.
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
```bash
|
||||||
|
LLAMA_CUBLAS=1 make -j
|
||||||
|
|
||||||
|
# generate importance matrix (imatrix.dat)
|
||||||
|
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||||
|
|
||||||
|
# use the imatrix to perform a Q4_K_M quantization
|
||||||
|
./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
|
||||||
|
```
|
|
@ -33,19 +33,61 @@ class IMatrixCollector {
|
||||||
public:
|
public:
|
||||||
IMatrixCollector() = default;
|
IMatrixCollector() = default;
|
||||||
void set_parameters(StatParams&& params) { m_params = std::move(params); }
|
void set_parameters(StatParams&& params) { m_params = std::move(params); }
|
||||||
void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
|
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
|
||||||
void save_imatrix() const;
|
void save_imatrix() const;
|
||||||
private:
|
private:
|
||||||
std::unordered_map<std::string, Stats> m_stats;
|
std::unordered_map<std::string, Stats> m_stats;
|
||||||
StatParams m_params;
|
StatParams m_params;
|
||||||
std::mutex m_mutex;
|
std::mutex m_mutex;
|
||||||
int m_last_call = 0;
|
int m_last_call = 0;
|
||||||
|
std::vector<float> m_src1_data;
|
||||||
|
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
|
||||||
};
|
};
|
||||||
|
|
||||||
void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
|
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
|
GGML_UNUSED(user_data);
|
||||||
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
|
|
||||||
|
const struct ggml_tensor * src0 = t->src[0];
|
||||||
|
const struct ggml_tensor * src1 = t->src[1];
|
||||||
|
|
||||||
|
// when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||||
|
// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
|
||||||
|
if (ask) {
|
||||||
|
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
|
||||||
|
if (t->op != GGML_OP_MUL_MAT) return false;
|
||||||
|
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
|
||||||
|
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
std::lock_guard<std::mutex> lock(m_mutex);
|
std::lock_guard<std::mutex> lock(m_mutex);
|
||||||
|
|
||||||
|
// copy the data from the GPU memory if needed
|
||||||
|
const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
|
||||||
|
|
||||||
|
if (!is_host) {
|
||||||
|
m_src1_data.resize(ggml_nelements(src1));
|
||||||
|
ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
|
||||||
|
}
|
||||||
|
|
||||||
|
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
|
||||||
|
|
||||||
|
if (t->op == GGML_OP_MUL_MAT_ID) {
|
||||||
|
const int idx = ((int32_t *) t->op_params)[0];
|
||||||
|
const int n_as = ((int32_t *) t->op_params)[1];
|
||||||
|
|
||||||
|
// the top-k selected expert ids are stored in the src0 tensor
|
||||||
|
// for simplicity, always copy src0 to host, because it is small
|
||||||
|
// take into account that src0 is not contiguous!
|
||||||
|
GGML_ASSERT(src0->ne[1] == src1->ne[1]);
|
||||||
|
GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
|
||||||
|
m_ids.resize(ggml_nbytes(src0)/sizeof(int));
|
||||||
|
ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
|
||||||
|
|
||||||
|
// loop over all possible experts, regardless if they are used or not in the batch
|
||||||
|
// this is necessary to guarantee equal number of "ncall" for each tensor
|
||||||
|
for (int ex = 0; ex < n_as; ++ex) {
|
||||||
|
src0 = t->src[2 + ex];
|
||||||
auto& e = m_stats[src0->name];
|
auto& e = m_stats[src0->name];
|
||||||
if (e.values.empty()) {
|
if (e.values.empty()) {
|
||||||
e.values.resize(src1->ne[0], 0);
|
e.values.resize(src1->ne[0], 0);
|
||||||
|
@ -54,12 +96,18 @@ void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const st
|
||||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
|
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
|
||||||
exit(1); //GGML_ASSERT(false);
|
exit(1); //GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
|
||||||
|
// using the following line, we can correct for that if needed
|
||||||
|
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
|
||||||
++e.ncall;
|
++e.ncall;
|
||||||
if (m_params.verbosity > 1) {
|
if (m_params.verbosity > 1) {
|
||||||
printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
|
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||||
}
|
}
|
||||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||||
const float * x = (const float *)src1->data + row * src1->ne[0];
|
const int excur = m_ids[row*n_as + idx];
|
||||||
|
GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
|
||||||
|
if (excur != ex) continue;
|
||||||
|
const float * x = data + row * src1->ne[0];
|
||||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||||
e.values[j] += x[j]*x[j];
|
e.values[j] += x[j]*x[j];
|
||||||
}
|
}
|
||||||
|
@ -71,6 +119,35 @@ void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const st
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
auto& e = m_stats[src0->name];
|
||||||
|
if (e.values.empty()) {
|
||||||
|
e.values.resize(src1->ne[0], 0);
|
||||||
|
}
|
||||||
|
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||||
|
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
|
||||||
|
exit(1); //GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
++e.ncall;
|
||||||
|
if (m_params.verbosity > 1) {
|
||||||
|
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||||
|
}
|
||||||
|
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||||
|
const float * x = data + row * src1->ne[0];
|
||||||
|
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||||
|
e.values[j] += x[j]*x[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (e.ncall > m_last_call) {
|
||||||
|
m_last_call = e.ncall;
|
||||||
|
if (m_last_call % m_params.n_output_frequency == 0) {
|
||||||
|
save_imatrix();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void IMatrixCollector::save_imatrix() const {
|
void IMatrixCollector::save_imatrix() const {
|
||||||
const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
|
const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
|
||||||
|
@ -93,8 +170,8 @@ void IMatrixCollector::save_imatrix() const {
|
||||||
|
|
||||||
static IMatrixCollector g_collector;
|
static IMatrixCollector g_collector;
|
||||||
|
|
||||||
static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
|
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
g_collector.collect_imatrix(src0, src1);
|
return g_collector.collect_imatrix(t, ask, user_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -320,8 +397,6 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
g_collector.set_parameters(std::move(sparams));
|
g_collector.set_parameters(std::move(sparams));
|
||||||
|
|
||||||
ggml_set_imatrix_collection(ik_collect_imatrix);
|
|
||||||
|
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||||
|
|
||||||
|
@ -340,16 +415,27 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
llama_backend_init(params.numa);
|
llama_backend_init(params.numa);
|
||||||
|
|
||||||
llama_model * model;
|
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
||||||
llama_context * ctx;
|
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
||||||
|
|
||||||
|
// pass the callback to the backend scheduler
|
||||||
|
// it will be executed for each node during the graph computation
|
||||||
|
cparams.cb_eval = ik_collect_imatrix;
|
||||||
|
cparams.cb_eval_user_data = NULL;
|
||||||
|
|
||||||
|
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
||||||
|
if (ctx == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: unable to create context\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
if (params.n_ctx > n_ctx_train) {
|
if (params.n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||||
|
|
33
examples/llama.android/.gitignore
vendored
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
# Gradle files
|
||||||
|
.gradle/
|
||||||
|
build/
|
||||||
|
|
||||||
|
# Local configuration file (sdk path, etc)
|
||||||
|
local.properties
|
||||||
|
|
||||||
|
# Log/OS Files
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Android Studio generated files and folders
|
||||||
|
captures/
|
||||||
|
.externalNativeBuild/
|
||||||
|
.cxx/
|
||||||
|
*.apk
|
||||||
|
output.json
|
||||||
|
|
||||||
|
# IntelliJ
|
||||||
|
*.iml
|
||||||
|
.idea/
|
||||||
|
misc.xml
|
||||||
|
deploymentTargetDropDown.xml
|
||||||
|
render.experimental.xml
|
||||||
|
|
||||||
|
# Keystore files
|
||||||
|
*.jks
|
||||||
|
*.keystore
|
||||||
|
|
||||||
|
# Google Services (e.g. APIs or Firebase)
|
||||||
|
google-services.json
|
||||||
|
|
||||||
|
# Android Profiling
|
||||||
|
*.hprof
|
0
examples/llama.android/README.md
Normal file
1
examples/llama.android/app/.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
/build
|
91
examples/llama.android/app/build.gradle.kts
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
plugins {
|
||||||
|
id("com.android.application")
|
||||||
|
id("org.jetbrains.kotlin.android")
|
||||||
|
}
|
||||||
|
|
||||||
|
android {
|
||||||
|
namespace = "com.example.llama"
|
||||||
|
compileSdk = 34
|
||||||
|
|
||||||
|
ndkVersion = "26.1.10909125"
|
||||||
|
|
||||||
|
defaultConfig {
|
||||||
|
applicationId = "com.example.llama"
|
||||||
|
minSdk = 33
|
||||||
|
targetSdk = 34
|
||||||
|
versionCode = 1
|
||||||
|
versionName = "1.0"
|
||||||
|
|
||||||
|
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
|
||||||
|
vectorDrawables {
|
||||||
|
useSupportLibrary = true
|
||||||
|
}
|
||||||
|
ndk {
|
||||||
|
// Workaround for https://github.com/llvm/llvm-project/issues/65820
|
||||||
|
// affecting armeabi-v7a. Skip armeabi-v7a when invoked with
|
||||||
|
// -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
|
||||||
|
if (project.hasProperty("skip-armeabi-v7a")) {
|
||||||
|
abiFilters += listOf("arm64-v8a", "x86_64", "x86")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
externalNativeBuild {
|
||||||
|
cmake {
|
||||||
|
cppFlags += listOf()
|
||||||
|
arguments += listOf()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
buildTypes {
|
||||||
|
release {
|
||||||
|
isMinifyEnabled = false
|
||||||
|
proguardFiles(
|
||||||
|
getDefaultProguardFile("proguard-android-optimize.txt"),
|
||||||
|
"proguard-rules.pro"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
compileOptions {
|
||||||
|
sourceCompatibility = JavaVersion.VERSION_1_8
|
||||||
|
targetCompatibility = JavaVersion.VERSION_1_8
|
||||||
|
}
|
||||||
|
kotlinOptions {
|
||||||
|
jvmTarget = "1.8"
|
||||||
|
}
|
||||||
|
buildFeatures {
|
||||||
|
compose = true
|
||||||
|
}
|
||||||
|
composeOptions {
|
||||||
|
kotlinCompilerExtensionVersion = "1.5.1"
|
||||||
|
}
|
||||||
|
packaging {
|
||||||
|
resources {
|
||||||
|
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
externalNativeBuild {
|
||||||
|
cmake {
|
||||||
|
path = file("src/main/cpp/CMakeLists.txt")
|
||||||
|
version = "3.22.1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
|
||||||
|
implementation("androidx.core:core-ktx:1.12.0")
|
||||||
|
implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
|
||||||
|
implementation("androidx.activity:activity-compose:1.8.2")
|
||||||
|
implementation(platform("androidx.compose:compose-bom:2023.08.00"))
|
||||||
|
implementation("androidx.compose.ui:ui")
|
||||||
|
implementation("androidx.compose.ui:ui-graphics")
|
||||||
|
implementation("androidx.compose.ui:ui-tooling-preview")
|
||||||
|
implementation("androidx.compose.material3:material3")
|
||||||
|
testImplementation("junit:junit:4.13.2")
|
||||||
|
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||||
|
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||||
|
androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
|
||||||
|
androidTestImplementation("androidx.compose.ui:ui-test-junit4")
|
||||||
|
debugImplementation("androidx.compose.ui:ui-tooling")
|
||||||
|
debugImplementation("androidx.compose.ui:ui-test-manifest")
|
||||||
|
}
|
21
examples/llama.android/app/proguard-rules.pro
vendored
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
# Add project specific ProGuard rules here.
|
||||||
|
# You can control the set of applied configuration files using the
|
||||||
|
# proguardFiles setting in build.gradle.
|
||||||
|
#
|
||||||
|
# For more details, see
|
||||||
|
# http://developer.android.com/guide/developing/tools/proguard.html
|
||||||
|
|
||||||
|
# If your project uses WebView with JS, uncomment the following
|
||||||
|
# and specify the fully qualified class name to the JavaScript interface
|
||||||
|
# class:
|
||||||
|
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
|
||||||
|
# public *;
|
||||||
|
#}
|
||||||
|
|
||||||
|
# Uncomment this to preserve the line number information for
|
||||||
|
# debugging stack traces.
|
||||||
|
#-keepattributes SourceFile,LineNumberTable
|
||||||
|
|
||||||
|
# If you keep the line number information, uncomment this to
|
||||||
|
# hide the original source file name.
|
||||||
|
#-renamesourcefileattribute SourceFile
|
30
examples/llama.android/app/src/main/AndroidManifest.xml
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
    xmlns:tools="http://schemas.android.com/tools">

    <!-- Declares that the app uses network access. -->
    <uses-permission android:name="android.permission.INTERNET" />

    <!-- Application-wide settings: backup rules, icons, label and theme. -->
    <application
        android:allowBackup="true"
        android:dataExtractionRules="@xml/data_extraction_rules"
        android:fullBackupContent="@xml/backup_rules"
        android:icon="@mipmap/ic_launcher"
        android:label="@string/app_name"
        android:roundIcon="@mipmap/ic_launcher_round"
        android:supportsRtl="true"
        android:theme="@style/Theme.LlamaAndroid"
        >

        <!-- Single exported activity, registered as the launcher entry point. -->
        <activity
            android:name=".MainActivity"
            android:exported="true"
            android:theme="@style/Theme.LlamaAndroid">
            <intent-filter>
                <action android:name="android.intent.action.MAIN" />

                <category android:name="android.intent.category.LAUNCHER" />
            </intent-filter>
        </activity>
    </application>

</manifest>
|
50
examples/llama.android/app/src/main/cpp/CMakeLists.txt
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
|
||||||
|
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.

# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)

# Declares the project name. The project name can be accessed via ${PROJECT_NAME}.
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")

# Fetch llama.cpp at configure time and add it to this build.
# NOTE(review): GIT_TAG names a moving branch, so the fetched sources can change
# between configures; consider pinning a commit hash for reproducible builds.
include(FetchContent)
FetchContent_Declare(
        llama
        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
        GIT_TAG        master
)

# Also provides "common"
FetchContent_MakeAvailable(llama)

# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
        # List C/C++ source files with relative paths to this CMakeLists.txt.
        llama-android.cpp)

# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
#
# PRIVATE: these libraries are needed to build the JNI library itself and
# nothing links against it, so the usage requirements are not propagated.
# The explicit keyword also avoids the legacy keyword-less signature.
target_link_libraries(${CMAKE_PROJECT_NAME}
        PRIVATE
        # Libraries linked into the target library
        llama
        common
        android
        log)
|
394
examples/llama.android/app/src/main/cpp/llama-android.cpp
Normal file
|
@ -0,0 +1,394 @@
|
||||||
|
#include <android/log.h>
|
||||||
|
#include <jni.h>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <math.h>
|
||||||
|
#include <string>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include "llama.h"
|
||||||
|
#include "common/common.h"
|
||||||
|
|
||||||
|
// Write C++ code here.
|
||||||
|
//
|
||||||
|
// Do not forget to dynamically load the C++ library into your application.
|
||||||
|
//
|
||||||
|
// For instance,
|
||||||
|
//
|
||||||
|
// In MainActivity.java:
|
||||||
|
// static {
|
||||||
|
// System.loadLibrary("llama-android");
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// Or, in MainActivity.kt:
|
||||||
|
// companion object {
|
||||||
|
// init {
|
||||||
|
// System.loadLibrary("llama-android")
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
#define TAG "llama-android.cpp"
|
||||||
|
#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
|
||||||
|
#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
|
||||||
|
|
||||||
|
jclass la_int_var;
|
||||||
|
jmethodID la_int_var_value;
|
||||||
|
jmethodID la_int_var_inc;
|
||||||
|
|
||||||
|
static void log_callback(ggml_log_level level, const char * fmt, void * data) {
|
||||||
|
if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
|
||||||
|
else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
|
||||||
|
else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data);
|
||||||
|
else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT jlong JNICALL
|
||||||
|
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
|
||||||
|
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
||||||
|
LOGi("Loading model from %s", path_to_model);
|
||||||
|
|
||||||
|
auto model = llama_load_model_from_file(path_to_model, model_params);
|
||||||
|
env->ReleaseStringUTFChars(filename, path_to_model);
|
||||||
|
|
||||||
|
if (!model) {
|
||||||
|
LOGe("load_model() failed");
|
||||||
|
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return reinterpret_cast<jlong>(model);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT void JNICALL
|
||||||
|
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
|
||||||
|
llama_free_model(reinterpret_cast<llama_model *>(model));
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT jlong JNICALL
|
||||||
|
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||||
|
auto model = reinterpret_cast<llama_model *>(jmodel);
|
||||||
|
|
||||||
|
if (!model) {
|
||||||
|
LOGe("new_context(): model cannot be null");
|
||||||
|
env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2));
|
||||||
|
LOGi("Using %d threads", n_threads);
|
||||||
|
|
||||||
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
|
ctx_params.seed = 1234;
|
||||||
|
ctx_params.n_ctx = 2048;
|
||||||
|
ctx_params.n_threads = n_threads;
|
||||||
|
ctx_params.n_threads_batch = n_threads;
|
||||||
|
|
||||||
|
llama_context * context = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
|
if (!context) {
|
||||||
|
LOGe("llama_new_context_with_model() returned null)");
|
||||||
|
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"),
|
||||||
|
"llama_new_context_with_model() returned null)");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return reinterpret_cast<jlong>(context);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT void JNICALL
|
||||||
|
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
|
||||||
|
llama_free(reinterpret_cast<llama_context *>(context));
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT void JNICALL
|
||||||
|
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
|
||||||
|
llama_backend_free();
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT void JNICALL
|
||||||
|
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
|
||||||
|
llama_log_set(log_callback, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT jstring JNICALL
|
||||||
|
Java_com_example_llama_Llm_bench_1model(
|
||||||
|
JNIEnv *env,
|
||||||
|
jobject,
|
||||||
|
jlong context_pointer,
|
||||||
|
jlong model_pointer,
|
||||||
|
jlong batch_pointer,
|
||||||
|
jint pp,
|
||||||
|
jint tg,
|
||||||
|
jint pl,
|
||||||
|
jint nr
|
||||||
|
) {
|
||||||
|
auto pp_avg = 0.0;
|
||||||
|
auto tg_avg = 0.0;
|
||||||
|
auto pp_std = 0.0;
|
||||||
|
auto tg_std = 0.0;
|
||||||
|
|
||||||
|
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||||
|
const auto model = reinterpret_cast<llama_model *>(model_pointer);
|
||||||
|
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||||
|
|
||||||
|
const int n_ctx = llama_n_ctx(context);
|
||||||
|
|
||||||
|
LOGi("n_ctx = %d", n_ctx);
|
||||||
|
|
||||||
|
int i, j;
|
||||||
|
int nri;
|
||||||
|
for (nri = 0; nri < nr; nri++) {
|
||||||
|
LOGi("Benchmark prompt processing (pp)");
|
||||||
|
|
||||||
|
llama_batch_clear(*batch);
|
||||||
|
|
||||||
|
const int n_tokens = pp;
|
||||||
|
for (i = 0; i < n_tokens; i++) {
|
||||||
|
llama_batch_add(*batch, 0, i, { 0 }, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
batch->logits[batch->n_tokens - 1] = true;
|
||||||
|
llama_kv_cache_clear(context);
|
||||||
|
|
||||||
|
const auto t_pp_start = ggml_time_us();
|
||||||
|
if (llama_decode(context, *batch) != 0) {
|
||||||
|
LOGi("llama_decode() failed during prompt processing");
|
||||||
|
}
|
||||||
|
const auto t_pp_end = ggml_time_us();
|
||||||
|
|
||||||
|
// bench text generation
|
||||||
|
|
||||||
|
LOGi("Benchmark text generation (tg)");
|
||||||
|
|
||||||
|
llama_kv_cache_clear(context);
|
||||||
|
const auto t_tg_start = ggml_time_us();
|
||||||
|
for (i = 0; i < tg; i++) {
|
||||||
|
|
||||||
|
llama_batch_clear(*batch);
|
||||||
|
for (j = 0; j < pl; j++) {
|
||||||
|
llama_batch_add(*batch, 0, i, { j }, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
LOGi("llama_decode() text generation: %d", i);
|
||||||
|
if (llama_decode(context, *batch) != 0) {
|
||||||
|
LOGi("llama_decode() failed during text generation");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto t_tg_end = ggml_time_us();
|
||||||
|
|
||||||
|
llama_kv_cache_clear(context);
|
||||||
|
|
||||||
|
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
|
||||||
|
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
|
||||||
|
|
||||||
|
const auto speed_pp = double(pp) / t_pp;
|
||||||
|
const auto speed_tg = double(pl * tg) / t_tg;
|
||||||
|
|
||||||
|
pp_avg += speed_pp;
|
||||||
|
tg_avg += speed_tg;
|
||||||
|
|
||||||
|
pp_std += speed_pp * speed_pp;
|
||||||
|
tg_std += speed_tg * speed_tg;
|
||||||
|
|
||||||
|
LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg);
|
||||||
|
}
|
||||||
|
|
||||||
|
pp_avg /= double(nr);
|
||||||
|
tg_avg /= double(nr);
|
||||||
|
|
||||||
|
if (nr > 1) {
|
||||||
|
pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1));
|
||||||
|
tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1));
|
||||||
|
} else {
|
||||||
|
pp_std = 0;
|
||||||
|
tg_std = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
char model_desc[128];
|
||||||
|
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||||
|
|
||||||
|
const auto model_size = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0;
|
||||||
|
const auto model_n_params = double(llama_model_n_params(model)) / 1e9;
|
||||||
|
|
||||||
|
const auto backend = "(Android)"; // TODO: What should this be?
|
||||||
|
|
||||||
|
std::stringstream result;
|
||||||
|
result << std::setprecision(2);
|
||||||
|
result << "| model | size | params | backend | test | t/s |\n";
|
||||||
|
result << "| --- | --- | --- | --- | --- | --- |\n";
|
||||||
|
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n";
|
||||||
|
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n";
|
||||||
|
|
||||||
|
return env->NewStringUTF(result.str().c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT void JNICALL
|
||||||
|
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||||
|
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT jlong JNICALL
|
||||||
|
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||||
|
|
||||||
|
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
||||||
|
|
||||||
|
llama_batch *batch = new llama_batch {
|
||||||
|
0,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (embd) {
|
||||||
|
batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd);
|
||||||
|
} else {
|
||||||
|
batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
|
||||||
|
batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
|
||||||
|
batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
|
||||||
|
for (int i = 0; i < n_tokens; ++i) {
|
||||||
|
batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
|
||||||
|
}
|
||||||
|
batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
|
||||||
|
|
||||||
|
return reinterpret_cast<jlong>(batch);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT void JNICALL
|
||||||
|
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
|
||||||
|
llama_backend_init(numa);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT jstring JNICALL
|
||||||
|
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
|
||||||
|
return env->NewStringUTF(llama_print_system_info());
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT jint JNICALL
|
||||||
|
Java_com_example_llama_Llm_completion_1init(
|
||||||
|
JNIEnv *env,
|
||||||
|
jobject,
|
||||||
|
jlong context_pointer,
|
||||||
|
jlong batch_pointer,
|
||||||
|
jstring jtext,
|
||||||
|
jint n_len
|
||||||
|
) {
|
||||||
|
|
||||||
|
const auto text = env->GetStringUTFChars(jtext, 0);
|
||||||
|
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||||
|
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||||
|
|
||||||
|
const auto tokens_list = llama_tokenize(context, text, 1);
|
||||||
|
|
||||||
|
auto n_ctx = llama_n_ctx(context);
|
||||||
|
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
|
||||||
|
|
||||||
|
LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
|
||||||
|
|
||||||
|
if (n_kv_req > n_ctx) {
|
||||||
|
LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto id : tokens_list) {
|
||||||
|
LOGi("%s", llama_token_to_piece(context, id).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_batch_clear(*batch);
|
||||||
|
|
||||||
|
// evaluate the initial prompt
|
||||||
|
for (auto i = 0; i < tokens_list.size(); i++) {
|
||||||
|
llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// llama_decode will output logits only for the last token of the prompt
|
||||||
|
batch->logits[batch->n_tokens - 1] = true;
|
||||||
|
|
||||||
|
if (llama_decode(context, *batch) != 0) {
|
||||||
|
LOGe("llama_decode() failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
env->ReleaseStringUTFChars(jtext, text);
|
||||||
|
|
||||||
|
return batch->n_tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT jstring JNICALL
|
||||||
|
Java_com_example_llama_Llm_completion_1loop(
|
||||||
|
JNIEnv * env,
|
||||||
|
jobject,
|
||||||
|
jlong context_pointer,
|
||||||
|
jlong batch_pointer,
|
||||||
|
jint n_len,
|
||||||
|
jobject intvar_ncur
|
||||||
|
) {
|
||||||
|
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||||
|
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||||
|
const auto model = llama_get_model(context);
|
||||||
|
|
||||||
|
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
|
||||||
|
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
|
||||||
|
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
|
||||||
|
|
||||||
|
auto n_vocab = llama_n_vocab(model);
|
||||||
|
auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
|
||||||
|
|
||||||
|
std::vector<llama_token_data> candidates;
|
||||||
|
candidates.reserve(n_vocab);
|
||||||
|
|
||||||
|
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||||
|
|
||||||
|
// sample the most likely token
|
||||||
|
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
|
||||||
|
|
||||||
|
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
|
||||||
|
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||||
|
return env->NewStringUTF("");
|
||||||
|
}
|
||||||
|
|
||||||
|
auto new_token_chars = llama_token_to_piece(context, new_token_id);
|
||||||
|
LOGi("new_token_chars: `%s`", new_token_chars.c_str());
|
||||||
|
auto new_token = env->NewStringUTF(new_token_chars.c_str());
|
||||||
|
|
||||||
|
llama_batch_clear(*batch);
|
||||||
|
llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
|
||||||
|
|
||||||
|
env->CallVoidMethod(intvar_ncur, la_int_var_inc);
|
||||||
|
|
||||||
|
if (llama_decode(context, *batch) != 0) {
|
||||||
|
LOGe("llama_decode() returned null");
|
||||||
|
}
|
||||||
|
|
||||||
|
return new_token;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT void JNICALL
|
||||||
|
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||||
|
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
||||||
|
}
|
|
@ -0,0 +1,119 @@
|
||||||
|
package com.example.llama
|
||||||
|
|
||||||
|
import android.app.DownloadManager
|
||||||
|
import android.net.Uri
|
||||||
|
import android.util.Log
|
||||||
|
import androidx.compose.material3.Button
|
||||||
|
import androidx.compose.material3.Text
|
||||||
|
import androidx.compose.runtime.Composable
|
||||||
|
import androidx.compose.runtime.getValue
|
||||||
|
import androidx.compose.runtime.mutableDoubleStateOf
|
||||||
|
import androidx.compose.runtime.mutableStateOf
|
||||||
|
import androidx.compose.runtime.remember
|
||||||
|
import androidx.compose.runtime.rememberCoroutineScope
|
||||||
|
import androidx.compose.runtime.setValue
|
||||||
|
import androidx.core.database.getLongOrNull
|
||||||
|
import androidx.core.net.toUri
|
||||||
|
import kotlinx.coroutines.delay
|
||||||
|
import kotlinx.coroutines.launch
|
||||||
|
import java.io.File
|
||||||
|
|
||||||
|
/**
 * A model file that can be downloaded and then loaded.
 *
 * @property name        label shown on the button and in the download notification
 * @property source      URI the file is downloaded from
 * @property destination local file the download is written to
 */
data class Downloadable(val name: String, val source: Uri, val destination: File) {
    companion object {
        @JvmStatic
        private val tag: String? = this::class.qualifiedName

        /** Download state of a single [Downloadable] as tracked by [Button]. */
        sealed interface State
        data object Ready: State
        data class Downloading(val id: Long): State
        data class Downloaded(val downloadable: Downloadable): State
        data class Error(val message: String): State

        /**
         * Button that downloads [item] through [dm] and, once the file is
         * present, loads it into [viewModel] when clicked.
         */
        @JvmStatic
        @Composable
        fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
            var status: State by remember {
                mutableStateOf(
                    if (item.destination.exists()) Downloaded(item)
                    else Ready
                )
            }
            var progress by remember { mutableDoubleStateOf(0.0) }

            val coroutineScope = rememberCoroutineScope()

            /**
             * Polls the download manager once per second until the download
             * identified by [result] finishes, fails, or disappears.
             */
            suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
                while (true) {
                    val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))

                    if (cursor == null) {
                        Log.e(tag, "dm.query() returned null")
                        return Error("dm.query() returned null")
                    }

                    // `use` closes the cursor on every exit path, including exceptions.
                    cursor.use { row ->
                        if (!row.moveToFirst() || row.count < 1) {
                            Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
                            return Ready
                        }

                        // A failed download never reaches sofar == total; without this
                        // check the loop would poll forever with the button disabled.
                        val six = row.getColumnIndex(DownloadManager.COLUMN_STATUS)
                        if (six >= 0 && row.getInt(six) == DownloadManager.STATUS_FAILED) {
                            Log.e(tag, "Download of ${item.name} failed")
                            return Error("Download of ${item.name} failed")
                        }

                        val pix = row.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
                        val tix = row.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
                        val sofar = row.getLongOrNull(pix) ?: 0
                        val total = row.getLongOrNull(tix) ?: 1

                        if (sofar == total) {
                            return Downloaded(item)
                        }

                        // Only report progress for a positive total, so the
                        // percentage never goes negative.
                        if (total > 0) {
                            progress = (sofar * 1.0) / total
                        }
                    }

                    delay(1000L)
                }
            }

            fun onClick() {
                when (val s = status) {
                    is Downloaded -> {
                        viewModel.load(item.destination.path)
                    }

                    is Downloading -> {
                        coroutineScope.launch {
                            status = waitForDownload(s, item)
                        }
                    }

                    // Ready or Error: (re)start the download from scratch.
                    else -> {
                        item.destination.delete()

                        val request = DownloadManager.Request(item.source).apply {
                            setTitle("Downloading model")
                            setDescription("Downloading model: ${item.name}")
                            setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
                            setDestinationUri(item.destination.toUri())
                        }

                        viewModel.log("Saving ${item.name} to ${item.destination.path}")
                        Log.i(tag, "Saving ${item.name} to ${item.destination.path}")

                        val id = dm.enqueue(request)
                        status = Downloading(id)
                        // Re-enter with the new state to start polling.
                        onClick()
                    }
                }
            }

            Button(onClick = { onClick() }, enabled = status !is Downloading) {
                when (status) {
                    is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
                    is Downloaded -> Text("Load ${item.name}")
                    is Ready -> Text("Download ${item.name}")
                    is Error -> Text("Download ${item.name}")
                }
            }
        }

    }
}
|
|
@ -0,0 +1,172 @@
|
||||||
|
package com.example.llama
|
||||||
|
|
||||||
|
import android.util.Log
|
||||||
|
import kotlinx.coroutines.CoroutineDispatcher
|
||||||
|
import kotlinx.coroutines.asCoroutineDispatcher
|
||||||
|
import kotlinx.coroutines.flow.Flow
|
||||||
|
import kotlinx.coroutines.flow.flow
|
||||||
|
import kotlinx.coroutines.flow.flowOn
|
||||||
|
import kotlinx.coroutines.withContext
|
||||||
|
import java.util.concurrent.Executors
|
||||||
|
import kotlin.concurrent.thread
|
||||||
|
|
||||||
|
/**
 * Kotlin facade over the native `llama-android` library.
 *
 * Every native call is dispatched onto one dedicated thread ([runLoop]); the
 * loaded-model state is kept in a [ThreadLocal] and is therefore only visible
 * from that thread.
 */
class Llm {
    private val tag: String? = this::class.simpleName

    private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }

    // Single-threaded dispatcher. Its thread loads the native library and
    // initialises the backend before it starts running submitted work.
    private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor {
        thread(start = false, name = "Llm-RunLoop") {
            Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}")

            // No-op if called more than once.
            System.loadLibrary("llama-android")

            // Set llama log handler to Android
            log_to_android()
            backend_init(false)

            Log.d(tag, system_info())

            it.run()
        }.apply {
            uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable ->
                Log.e(tag, "Unhandled exception", exception)
            }
        }
    }.asCoroutineDispatcher()

    // Length limit passed to the native completion calls.
    private val nlen: Int = 64

    private external fun log_to_android()
    private external fun load_model(filename: String): Long
    private external fun free_model(model: Long)
    private external fun new_context(model: Long): Long
    private external fun free_context(context: Long)
    private external fun backend_init(numa: Boolean)
    private external fun backend_free()
    private external fun free_batch(batch: Long)
    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
    private external fun bench_model(
        context: Long,
        model: Long,
        batch: Long,
        pp: Int,
        tg: Int,
        pl: Int,
        nr: Int
    ): String

    private external fun system_info(): String

    private external fun completion_init(
        context: Long,
        batch: Long,
        text: String,
        nLen: Int
    ): Int

    private external fun completion_loop(
        context: Long,
        batch: Long,
        nLen: Int,
        ncur: IntVar
    ): String

    private external fun kv_cache_clear(context: Long)

    /**
     * Runs the native benchmark on the loaded model and returns its report.
     *
     * @throws IllegalStateException if no model is loaded
     */
    suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String {
        return withContext(runLoop) {
            when (val state = threadLocalState.get()) {
                is State.Loaded -> {
                    Log.d(tag, "bench(): $state")
                    bench_model(state.context, state.model, state.batch, pp, tg, pl, nr)
                }

                else -> throw IllegalStateException("No model loaded")
            }
        }
    }

    /**
     * Loads the model at [pathToModel] and creates its context and batch.
     *
     * If any step fails, the native objects created so far are released
     * before the exception is rethrown, so a failed load does not leak them.
     *
     * @throws IllegalStateException if a model is already loaded or a step fails
     */
    suspend fun load(pathToModel: String) {
        withContext(runLoop) {
            when (threadLocalState.get()) {
                is State.Idle -> {
                    var model = 0L
                    var context = 0L
                    try {
                        model = load_model(pathToModel)
                        if (model == 0L) throw IllegalStateException("load_model() failed")

                        context = new_context(model)
                        if (context == 0L) throw IllegalStateException("new_context() failed")

                        val batch = new_batch(512, 0, 1)
                        if (batch == 0L) throw IllegalStateException("new_batch() failed")

                        Log.i(tag, "Loaded model $pathToModel")
                        threadLocalState.set(State.Loaded(model, context, batch))
                    } catch (e: Throwable) {
                        // Release in reverse order of creation.
                        if (context != 0L) free_context(context)
                        if (model != 0L) free_model(model)
                        throw e
                    }
                }
                else -> throw IllegalStateException("Model already loaded")
            }
        }
    }

    /**
     * Streams the completion of [message] piece by piece.
     *
     * Emits nothing when no model is loaded. The KV cache is cleared when
     * generation ends for any reason, including cancellation of the
     * collector, so the next request does not start from a stale cache.
     */
    fun send(message: String): Flow<String> = flow {
        when (val state = threadLocalState.get()) {
            is State.Loaded -> {
                try {
                    val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
                    while (ncur.value <= nlen) {
                        val str = completion_loop(state.context, state.batch, nlen, ncur)
                        // The native side signals the end of generation with an empty string.
                        if (str.isEmpty()) {
                            break
                        }
                        emit(str)
                    }
                } finally {
                    kv_cache_clear(state.context)
                }
            }
            else -> {}
        }
    }.flowOn(runLoop)

    /**
     * Unloads the model and frees resources.
     *
     * This is a no-op if there's no model loaded.
     */
    suspend fun unload() {
        withContext(runLoop) {
            when (val state = threadLocalState.get()) {
                is State.Loaded -> {
                    free_context(state.context)
                    free_model(state.model)
                    free_batch(state.batch)

                    threadLocalState.set(State.Idle)
                }
                else -> {}
            }
        }
    }

    companion object {
        // Mutable counter shared with native code, which looks up its
        // `getValue()I` and `inc()V` methods by name - do not rename them.
        private class IntVar(value: Int) {
            @Volatile
            var value: Int = value
                private set

            fun inc() {
                synchronized(this) {
                    value += 1
                }
            }
        }

        private sealed interface State {
            data object Idle: State
            data class Loaded(val model: Long, val context: Long, val batch: Long): State
        }

        // Enforce only one instance of Llm.
        private val _instance: Llm = Llm()

        fun instance(): Llm = _instance
    }
}
|
|
@ -0,0 +1,154 @@
|
||||||
|
package com.example.llama
|
||||||
|
|
||||||
|
import android.app.ActivityManager
|
||||||
|
import android.app.DownloadManager
|
||||||
|
import android.content.ClipData
|
||||||
|
import android.content.ClipboardManager
|
||||||
|
import android.net.Uri
|
||||||
|
import android.os.Bundle
|
||||||
|
import android.os.StrictMode
|
||||||
|
import android.os.StrictMode.VmPolicy
|
||||||
|
import android.text.format.Formatter
|
||||||
|
import androidx.activity.ComponentActivity
|
||||||
|
import androidx.activity.compose.setContent
|
||||||
|
import androidx.activity.viewModels
|
||||||
|
import androidx.compose.foundation.layout.Box
|
||||||
|
import androidx.compose.foundation.layout.Column
|
||||||
|
import androidx.compose.foundation.layout.Row
|
||||||
|
import androidx.compose.foundation.layout.fillMaxSize
|
||||||
|
import androidx.compose.foundation.layout.padding
|
||||||
|
import androidx.compose.foundation.lazy.LazyColumn
|
||||||
|
import androidx.compose.foundation.lazy.items
|
||||||
|
import androidx.compose.foundation.lazy.rememberLazyListState
|
||||||
|
import androidx.compose.material3.Button
|
||||||
|
import androidx.compose.material3.LocalContentColor
|
||||||
|
import androidx.compose.material3.MaterialTheme
|
||||||
|
import androidx.compose.material3.OutlinedTextField
|
||||||
|
import androidx.compose.material3.Surface
|
||||||
|
import androidx.compose.material3.Text
|
||||||
|
import androidx.compose.runtime.Composable
|
||||||
|
import androidx.compose.ui.Modifier
|
||||||
|
import androidx.compose.ui.unit.dp
|
||||||
|
import androidx.core.content.getSystemService
|
||||||
|
import com.example.llama.ui.theme.LlamaAndroidTheme
|
||||||
|
import java.io.File
|
||||||
|
|
||||||
|
/**
 * Single activity of the sample app.
 *
 * The three system services can be supplied through the constructor; any argument left
 * as null is resolved lazily with [getSystemService] the first time it is used.
 */
class MainActivity(
    activityManager: ActivityManager? = null,
    downloadManager: DownloadManager? = null,
    clipboardManager: ClipboardManager? = null,
): ComponentActivity() {
    private val tag: String? = this::class.simpleName

    // Inside each initializer the bare name refers to the constructor argument of the same name.
    private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
    private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
    private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }

    private val viewModel: MainViewModel by viewModels()

    // Get a MemoryInfo object for the device's current memory status.
    private fun availableMemory(): ActivityManager.MemoryInfo {
        return ActivityManager.MemoryInfo().also { memoryInfo ->
            activityManager.getMemoryInfo(memoryInfo)
        }
    }

    /**
     * Enables leaked-closable detection in StrictMode, logs the memory status and the
     * external files directory to the view model, and installs the Compose UI.
     */
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)

        StrictMode.setVmPolicy(
            VmPolicy.Builder(StrictMode.getVmPolicy())
                .detectLeakedClosableObjects()
                .build()
        )

        // Take one snapshot so that "free" and "total" describe the same moment.
        val memoryInfo = availableMemory()
        val free = Formatter.formatFileSize(this, memoryInfo.availMem)
        val total = Formatter.formatFileSize(this, memoryInfo.totalMem)

        // Looked up once; used both for the log line and as the download destination.
        val extFilesDir = getExternalFilesDir(null)

        viewModel.log("Current memory: $free / $total")
        viewModel.log("Downloads directory: $extFilesDir")

        // Models offered for download: display name, source URL, destination file.
        val models = listOf(
            Downloadable(
                // Phi-2 is a 2.7B-parameter model (the previous label said "7B").
                "Phi-2 2.7B (Q4_0, 1.6 GiB)",
                Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
                File(extFilesDir, "phi-2-q4_0.gguf"),
            ),
            Downloadable(
                "TinyLlama 1.1B (f16, 2.2 GiB)",
                Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
                File(extFilesDir, "tinyllama-1.1-f16.gguf"),
            ),
            Downloadable(
                "Phi 2 DPO (Q3_K_M, 1.48 GiB)",
                Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
                File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
            ),
        )

        setContent {
            LlamaAndroidTheme {
                // A surface container using the 'background' color from the theme
                Surface(
                    modifier = Modifier.fillMaxSize(),
                    color = MaterialTheme.colorScheme.background
                ) {
                    MainCompose(
                        viewModel,
                        clipboardManager,
                        downloadManager,
                        models,
                    )
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
/**
 * Root screen of the app.
 *
 * From top to bottom it shows the message log (taking the remaining height), the message
 * input field, a row of action buttons (Send, Bench, Clear, Copy) and one download
 * button per entry in [models].
 *
 * @param viewModel source of the message log and input text, and target of all actions.
 * @param clipboard receives the whole log as plain text when "Copy" is pressed.
 * @param dm download manager handed to each model's download button.
 * @param models models for which a download button is rendered.
 */
@Composable
fun MainCompose(
    viewModel: MainViewModel,
    clipboard: ClipboardManager,
    dm: DownloadManager,
    models: List<Downloadable>
) {
    Column {
        val listState = rememberLazyListState()

        // Message log.
        Box(modifier = Modifier.weight(1f)) {
            LazyColumn(state = listState) {
                items(viewModel.messages) { line ->
                    Text(
                        text = line,
                        style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
                        modifier = Modifier.padding(16.dp)
                    )
                }
            }
        }

        // Input field bound to the view model's pending message.
        OutlinedTextField(
            value = viewModel.message,
            onValueChange = { typed -> viewModel.updateMessage(typed) },
            label = { Text("Message") },
        )

        // Action buttons.
        Row {
            Button(onClick = { viewModel.send() }) { Text("Send") }
            Button(onClick = { viewModel.bench(8, 4, 1) }) { Text("Bench") }
            Button(onClick = { viewModel.clear() }) { Text("Clear") }
            Button(onClick = {
                // Copy the whole log, one message per line, with an empty clip label.
                val transcript = viewModel.messages.joinToString("\n")
                clipboard.setPrimaryClip(ClipData.newPlainText("", transcript))
            }) { Text("Copy") }
        }

        // One download button per model.
        Column {
            models.forEach { model ->
                Downloadable.Button(viewModel, dm, model)
            }
        }
    }
}
|
|
@ -0,0 +1,104 @@
|
||||||
|
package com.example.llama
|
||||||
|
|
||||||
|
import android.util.Log
|
||||||
|
import androidx.compose.runtime.getValue
|
||||||
|
import androidx.compose.runtime.mutableStateOf
|
||||||
|
import androidx.compose.runtime.setValue
|
||||||
|
import androidx.lifecycle.ViewModel
|
||||||
|
import androidx.lifecycle.viewModelScope
|
||||||
|
import kotlinx.coroutines.flow.catch
|
||||||
|
import kotlinx.coroutines.launch
|
||||||
|
|
||||||
|
/**
 * View model backing the main screen.
 *
 * It holds the message log ([messages]) and the text currently being typed ([message])
 * as Compose state, and forwards user actions to [llm] from coroutines launched on
 * [viewModelScope].
 */
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
    companion object {
        @JvmStatic
        private val NanosPerSecond = 1_000_000_000.0
    }

    private val tag: String? = this::class.simpleName

    /** Lines shown in the message log. */
    var messages by mutableStateOf(listOf("Initializing..."))
        private set

    /** Text currently in the input field. */
    var message by mutableStateOf("")
        private set

    /** Text to show for a failure; falls back to toString() when the throwable has no message. */
    private fun describe(exc: Throwable): String = exc.message ?: exc.toString()

    /**
     * Unloads the model when the view model is destroyed.
     *
     * AndroidX cancels viewModelScope before invoking onCleared(), so a coroutine launched
     * on it here would never run and the model would stay loaded. The unload is therefore
     * run with runBlocking, which blocks the calling thread until llm.unload() returns.
     * NOTE(review): this assumes unload() completes quickly — confirm against Llm.
     */
    override fun onCleared() {
        super.onCleared()

        kotlinx.coroutines.runBlocking {
            try {
                llm.unload()
            } catch (exc: IllegalStateException) {
                Log.e(tag, "unload() failed", exc)
                messages += describe(exc)
            }
        }
    }

    /**
     * Sends the current input text to the model.
     *
     * The text is appended to the log followed by an empty line; every value emitted by
     * llm.send() is then appended to the last log line.
     */
    fun send() {
        val text = message
        message = ""

        // Add to messages console.
        messages += text
        messages += ""

        viewModelScope.launch {
            llm.send(text)
                .catch {
                    Log.e(tag, "send() failed", it)
                    messages += describe(it)
                }
                .collect { piece ->
                    // The log may have been emptied by clear() while output is still
                    // arriving, so do not assume a last line exists.
                    messages = messages.dropLast(1) + ((messages.lastOrNull() ?: "") + piece)
                }
        }
    }

    /**
     * Runs a benchmark and writes the results to the log.
     *
     * A warm-up pass with the given parameters runs first and is timed; if it takes more
     * than 5 seconds the benchmark is aborted, otherwise llm.bench(512, 128, 1, 3) is run.
     * The parameters are passed to llm.bench() unchanged.
     */
    fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
        viewModelScope.launch {
            try {
                val start = System.nanoTime()
                val warmupResult = llm.bench(pp, tg, pl, nr)
                val end = System.nanoTime()

                messages += warmupResult

                val warmup = (end - start).toDouble() / NanosPerSecond
                messages += "Warm up time: $warmup seconds, please wait..."

                if (warmup > 5.0) {
                    messages += "Warm up took too long, aborting benchmark"
                    return@launch
                }

                messages += llm.bench(512, 128, 1, 3)
            } catch (exc: IllegalStateException) {
                Log.e(tag, "bench() failed", exc)
                messages += describe(exc)
            }
        }
    }

    /** Loads the model at [pathToModel] and logs either success or the failure message. */
    fun load(pathToModel: String) {
        viewModelScope.launch {
            try {
                llm.load(pathToModel)
                messages += "Loaded $pathToModel"
            } catch (exc: IllegalStateException) {
                Log.e(tag, "load() failed", exc)
                messages += describe(exc)
            }
        }
    }

    /** Replaces the text in the input field. */
    fun updateMessage(newMessage: String) {
        message = newMessage
    }

    /** Empties the message log. */
    fun clear() {
        messages = listOf()
    }

    /** Appends [message] as a new line of the message log. */
    fun log(message: String) {
        messages += message
    }
}
|
|
@ -0,0 +1,11 @@
|
||||||
|
package com.example.llama.ui.theme
|
||||||
|
|
||||||
|
import androidx.compose.ui.graphics.Color
|
||||||
|
|
||||||
|
val Purple80 = Color(0xFFD0BCFF)
|
||||||
|
val PurpleGrey80 = Color(0xFFCCC2DC)
|
||||||
|
val Pink80 = Color(0xFFEFB8C8)
|
||||||
|
|
||||||
|
val Purple40 = Color(0xFF6650a4)
|
||||||
|
val PurpleGrey40 = Color(0xFF625b71)
|
||||||
|
val Pink40 = Color(0xFF7D5260)
|
|
@ -0,0 +1,70 @@
|
||||||
|
package com.example.llama.ui.theme
|
||||||
|
|
||||||
|
import android.app.Activity
|
||||||
|
import android.os.Build
|
||||||
|
import androidx.compose.foundation.isSystemInDarkTheme
|
||||||
|
import androidx.compose.material3.MaterialTheme
|
||||||
|
import androidx.compose.material3.darkColorScheme
|
||||||
|
import androidx.compose.material3.dynamicDarkColorScheme
|
||||||
|
import androidx.compose.material3.dynamicLightColorScheme
|
||||||
|
import androidx.compose.material3.lightColorScheme
|
||||||
|
import androidx.compose.runtime.Composable
|
||||||
|
import androidx.compose.runtime.SideEffect
|
||||||
|
import androidx.compose.ui.graphics.toArgb
|
||||||
|
import androidx.compose.ui.platform.LocalContext
|
||||||
|
import androidx.compose.ui.platform.LocalView
|
||||||
|
import androidx.core.view.WindowCompat
|
||||||
|
|
||||||
|
private val DarkColorScheme = darkColorScheme(
|
||||||
|
primary = Purple80,
|
||||||
|
secondary = PurpleGrey80,
|
||||||
|
tertiary = Pink80
|
||||||
|
)
|
||||||
|
|
||||||
|
private val LightColorScheme = lightColorScheme(
|
||||||
|
primary = Purple40,
|
||||||
|
secondary = PurpleGrey40,
|
||||||
|
tertiary = Pink40
|
||||||
|
|
||||||
|
/* Other default colors to override
|
||||||
|
background = Color(0xFFFFFBFE),
|
||||||
|
surface = Color(0xFFFFFBFE),
|
||||||
|
onPrimary = Color.White,
|
||||||
|
onSecondary = Color.White,
|
||||||
|
onTertiary = Color.White,
|
||||||
|
onBackground = Color(0xFF1C1B1F),
|
||||||
|
onSurface = Color(0xFF1C1B1F),
|
||||||
|
*/
|
||||||
|
)
|
||||||
|
|
||||||
|
/**
 * Material 3 theme wrapper for the app.
 *
 * @param darkTheme whether to use dark colors; defaults to the system setting.
 * @param dynamicColor whether to use the dynamic color schemes when the device runs
 *   Android 12 (API level S) or newer.
 * @param content the UI to render inside the theme.
 */
@Composable
fun LlamaAndroidTheme(
    darkTheme: Boolean = isSystemInDarkTheme(),
    // Dynamic color is available on Android 12+
    dynamicColor: Boolean = true,
    content: @Composable () -> Unit
) {
    // Dynamic colors take precedence; otherwise use the static dark or light scheme.
    val palette = if (dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
        val ctx = LocalContext.current
        if (darkTheme) {
            dynamicDarkColorScheme(ctx)
        } else {
            dynamicLightColorScheme(ctx)
        }
    } else if (darkTheme) {
        DarkColorScheme
    } else {
        LightColorScheme
    }

    val currentView = LocalView.current
    if (!currentView.isInEditMode) {
        // Outside edit mode, tint the status bar with the primary color and set its
        // light-appearance flag from darkTheme.
        SideEffect {
            val hostWindow = (currentView.context as Activity).window
            hostWindow.statusBarColor = palette.primary.toArgb()
            WindowCompat.getInsetsController(hostWindow, currentView).isAppearanceLightStatusBars = darkTheme
        }
    }

    MaterialTheme(
        colorScheme = palette,
        typography = Typography,
        content = content
    )
}
|
|
@ -0,0 +1,34 @@
|
||||||
|
package com.example.llama.ui.theme
|
||||||
|
|
||||||
|
import androidx.compose.material3.Typography
|
||||||
|
import androidx.compose.ui.text.TextStyle
|
||||||
|
import androidx.compose.ui.text.font.FontFamily
|
||||||
|
import androidx.compose.ui.text.font.FontWeight
|
||||||
|
import androidx.compose.ui.unit.sp
|
||||||
|
|
||||||
|
// Set of Material typography styles to start with
|
||||||
|
val Typography = Typography(
|
||||||
|
bodyLarge = TextStyle(
|
||||||
|
fontFamily = FontFamily.Default,
|
||||||
|
fontWeight = FontWeight.Normal,
|
||||||
|
fontSize = 16.sp,
|
||||||
|
lineHeight = 24.sp,
|
||||||
|
letterSpacing = 0.5.sp
|
||||||
|
)
|
||||||
|
/* Other default text styles to override
|
||||||
|
titleLarge = TextStyle(
|
||||||
|
fontFamily = FontFamily.Default,
|
||||||
|
fontWeight = FontWeight.Normal,
|
||||||
|
fontSize = 22.sp,
|
||||||
|
lineHeight = 28.sp,
|
||||||
|
letterSpacing = 0.sp
|
||||||
|
),
|
||||||
|
labelSmall = TextStyle(
|
||||||
|
fontFamily = FontFamily.Default,
|
||||||
|
fontWeight = FontWeight.Medium,
|
||||||
|
fontSize = 11.sp,
|
||||||
|
lineHeight = 16.sp,
|
||||||
|
letterSpacing = 0.5.sp
|
||||||
|
)
|
||||||
|
*/
|
||||||
|
)
|
|
@ -0,0 +1,170 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<vector xmlns:android="http://schemas.android.com/apk/res/android"
|
||||||
|
android:width="108dp"
|
||||||
|
android:height="108dp"
|
||||||
|
android:viewportWidth="108"
|
||||||
|
android:viewportHeight="108">
|
||||||
|
<path
|
||||||
|
android:fillColor="#3DDC84"
|
||||||
|
android:pathData="M0,0h108v108h-108z" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M9,0L9,108"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M19,0L19,108"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M29,0L29,108"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M39,0L39,108"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M49,0L49,108"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M59,0L59,108"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M69,0L69,108"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M79,0L79,108"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M89,0L89,108"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M99,0L99,108"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M0,9L108,9"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M0,19L108,19"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M0,29L108,29"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M0,39L108,39"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M0,49L108,49"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M0,59L108,59"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M0,69L108,69"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M0,79L108,79"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M0,89L108,89"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M0,99L108,99"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M19,29L89,29"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M19,39L89,39"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M19,49L89,49"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M19,59L89,59"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M19,69L89,69"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M19,79L89,79"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M29,19L29,89"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M39,19L39,89"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M49,19L49,89"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M59,19L59,89"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M69,19L69,89"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
<path
|
||||||
|
android:fillColor="#00000000"
|
||||||
|
android:pathData="M79,19L79,89"
|
||||||
|
android:strokeWidth="0.8"
|
||||||
|
android:strokeColor="#33FFFFFF" />
|
||||||
|
</vector>
|
|
@ -0,0 +1,30 @@
|
||||||
|
<vector xmlns:android="http://schemas.android.com/apk/res/android"
|
||||||
|
xmlns:aapt="http://schemas.android.com/aapt"
|
||||||
|
android:width="108dp"
|
||||||
|
android:height="108dp"
|
||||||
|
android:viewportWidth="108"
|
||||||
|
android:viewportHeight="108">
|
||||||
|
<path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
|
||||||
|
<aapt:attr name="android:fillColor">
|
||||||
|
<gradient
|
||||||
|
android:endX="85.84757"
|
||||||
|
android:endY="92.4963"
|
||||||
|
android:startX="42.9492"
|
||||||
|
android:startY="49.59793"
|
||||||
|
android:type="linear">
|
||||||
|
<item
|
||||||
|
android:color="#44000000"
|
||||||
|
android:offset="0.0" />
|
||||||
|
<item
|
||||||
|
android:color="#00000000"
|
||||||
|
android:offset="1.0" />
|
||||||
|
</gradient>
|
||||||
|
</aapt:attr>
|
||||||
|
</path>
|
||||||
|
<path
|
||||||
|
android:fillColor="#FFFFFF"
|
||||||
|
android:fillType="nonZero"
|
||||||
|
android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
|
||||||
|
android:strokeWidth="1"
|
||||||
|
android:strokeColor="#00000000" />
|
||||||
|
</vector>
|
|
@ -0,0 +1,6 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
|
||||||
|
<background android:drawable="@drawable/ic_launcher_background" />
|
||||||
|
<foreground android:drawable="@drawable/ic_launcher_foreground" />
|
||||||
|
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
|
||||||
|
</adaptive-icon>
|
|
@ -0,0 +1,6 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
|
||||||
|
<background android:drawable="@drawable/ic_launcher_background" />
|
||||||
|
<foreground android:drawable="@drawable/ic_launcher_foreground" />
|
||||||
|
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
|
||||||
|
</adaptive-icon>
|
After Width: | Height: | Size: 1.4 KiB |
After Width: | Height: | Size: 2.8 KiB |
After Width: | Height: | Size: 982 B |
After Width: | Height: | Size: 1.7 KiB |
After Width: | Height: | Size: 1.9 KiB |
After Width: | Height: | Size: 3.8 KiB |
After Width: | Height: | Size: 2.8 KiB |
After Width: | Height: | Size: 5.8 KiB |
After Width: | Height: | Size: 3.8 KiB |
After Width: | Height: | Size: 7.6 KiB |
10
examples/llama.android/app/src/main/res/values/colors.xml
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<resources>
|
||||||
|
<color name="purple_200">#FFBB86FC</color>
|
||||||
|
<color name="purple_500">#FF6200EE</color>
|
||||||
|
<color name="purple_700">#FF3700B3</color>
|
||||||
|
<color name="teal_200">#FF03DAC5</color>
|
||||||
|
<color name="teal_700">#FF018786</color>
|
||||||
|
<color name="black">#FF000000</color>
|
||||||
|
<color name="white">#FFFFFFFF</color>
|
||||||
|
</resources>
|
|
@ -0,0 +1,3 @@
|
||||||
|
<resources>
|
||||||
|
<string name="app_name">LlamaAndroid</string>
|
||||||
|
</resources>
|
|
@ -0,0 +1,5 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<resources>
|
||||||
|
|
||||||
|
<style name="Theme.LlamaAndroid" parent="android:Theme.Material.Light.NoActionBar" />
|
||||||
|
</resources>
|
13
examples/llama.android/app/src/main/res/xml/backup_rules.xml
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?><!--
|
||||||
|
Sample backup rules file; uncomment and customize as necessary.
|
||||||
|
See https://developer.android.com/guide/topics/data/autobackup
|
||||||
|
for details.
|
||||||
|
Note: This file is ignored for devices older than API 31
|
||||||
|
See https://developer.android.com/about/versions/12/backup-restore
|
||||||
|
-->
|
||||||
|
<full-backup-content>
|
||||||
|
<!--
|
||||||
|
<include domain="sharedpref" path="."/>
|
||||||
|
<exclude domain="sharedpref" path="device.xml"/>
|
||||||
|
-->
|
||||||
|
</full-backup-content>
|
|
@ -0,0 +1,19 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?><!--
|
||||||
|
Sample data extraction rules file; uncomment and customize as necessary.
|
||||||
|
See https://developer.android.com/about/versions/12/backup-restore#xml-changes
|
||||||
|
for details.
|
||||||
|
-->
|
||||||
|
<data-extraction-rules>
|
||||||
|
<cloud-backup>
|
||||||
|
<!-- TODO: Use <include> and <exclude> to control what is backed up.
|
||||||
|
<include .../>
|
||||||
|
<exclude .../>
|
||||||
|
-->
|
||||||
|
</cloud-backup>
|
||||||
|
<!--
|
||||||
|
<device-transfer>
|
||||||
|
<include .../>
|
||||||
|
<exclude .../>
|
||||||
|
</device-transfer>
|
||||||
|
-->
|
||||||
|
</data-extraction-rules>
|
5
examples/llama.android/build.gradle.kts
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
// Top-level build file where you can add configuration options common to all sub-projects/modules.
|
||||||
|
plugins {
|
||||||
|
id("com.android.application") version "8.2.0" apply false
|
||||||
|
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
||||||
|
}
|
23
examples/llama.android/gradle.properties
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
# Project-wide Gradle settings.
|
||||||
|
# IDE (e.g. Android Studio) users:
|
||||||
|
# Gradle settings configured through the IDE *will override*
|
||||||
|
# any settings specified in this file.
|
||||||
|
# For more details on how to configure your build environment visit
|
||||||
|
# http://www.gradle.org/docs/current/userguide/build_environment.html
|
||||||
|
# Specifies the JVM arguments used for the daemon process.
|
||||||
|
# The setting is particularly useful for tweaking memory settings.
|
||||||
|
org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
|
||||||
|
# When configured, Gradle will run in incubating parallel mode.
|
||||||
|
# This option should only be used with decoupled projects. More details, visit
|
||||||
|
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
|
||||||
|
# org.gradle.parallel=true
|
||||||
|
# AndroidX package structure to make it clearer which packages are bundled with the
|
||||||
|
# Android operating system, and which are packaged with your app's APK
|
||||||
|
# https://developer.android.com/topic/libraries/support-library/androidx-rn
|
||||||
|
android.useAndroidX=true
|
||||||
|
# Kotlin code style for this project: "official" or "obsolete":
|
||||||
|
kotlin.code.style=official
|
||||||
|
# Enables namespacing of each library's R class so that its R class includes only the
|
||||||
|
# resources declared in the library itself and none from the library's dependencies,
|
||||||
|
# thereby reducing the size of the R class for that library
|
||||||
|
android.nonTransitiveRClass=true
|
BIN
examples/llama.android/gradle/wrapper/gradle-wrapper.jar
vendored
Normal file
6
examples/llama.android/gradle/wrapper/gradle-wrapper.properties
vendored
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
#Thu Dec 21 14:31:09 AEDT 2023
|
||||||
|
distributionBase=GRADLE_USER_HOME
|
||||||
|
distributionPath=wrapper/dists
|
||||||
|
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
|
||||||
|
zipStoreBase=GRADLE_USER_HOME
|
||||||
|
zipStorePath=wrapper/dists
|
185
examples/llama.android/gradlew
vendored
Executable file
|
@ -0,0 +1,185 @@
|
||||||
|
#!/usr/bin/env sh
|
||||||
|
|
||||||
|
#
|
||||||
|
# Copyright 2015 the original author or authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# https://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
##
|
||||||
|
## Gradle start up script for UN*X
|
||||||
|
##
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
# Attempt to set APP_HOME
|
||||||
|
# Resolve links: $0 may be a link
|
||||||
|
PRG="$0"
|
||||||
|
# Need this for relative symlinks.
|
||||||
|
while [ -h "$PRG" ] ; do
|
||||||
|
ls=`ls -ld "$PRG"`
|
||||||
|
link=`expr "$ls" : '.*-> \(.*\)$'`
|
||||||
|
if expr "$link" : '/.*' > /dev/null; then
|
||||||
|
PRG="$link"
|
||||||
|
else
|
||||||
|
PRG=`dirname "$PRG"`"/$link"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
SAVED="`pwd`"
|
||||||
|
cd "`dirname \"$PRG\"`/" >/dev/null
|
||||||
|
APP_HOME="`pwd -P`"
|
||||||
|
cd "$SAVED" >/dev/null
|
||||||
|
|
||||||
|
APP_NAME="Gradle"
|
||||||
|
APP_BASE_NAME=`basename "$0"`
|
||||||
|
|
||||||
|
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
||||||
|
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
|
||||||
|
|
||||||
|
# Use the maximum available, or set MAX_FD != -1 to use that value.
|
||||||
|
MAX_FD="maximum"
|
||||||
|
|
||||||
|
warn () {
|
||||||
|
echo "$*"
|
||||||
|
}
|
||||||
|
|
||||||
|
die () {
|
||||||
|
echo
|
||||||
|
echo "$*"
|
||||||
|
echo
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# OS specific support (must be 'true' or 'false').
|
||||||
|
cygwin=false
|
||||||
|
msys=false
|
||||||
|
darwin=false
|
||||||
|
nonstop=false
|
||||||
|
case "`uname`" in
|
||||||
|
CYGWIN* )
|
||||||
|
cygwin=true
|
||||||
|
;;
|
||||||
|
Darwin* )
|
||||||
|
darwin=true
|
||||||
|
;;
|
||||||
|
MINGW* )
|
||||||
|
msys=true
|
||||||
|
;;
|
||||||
|
NONSTOP* )
|
||||||
|
nonstop=true
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
|
||||||
|
|
||||||
|
|
||||||
|
# Determine the Java command to use to start the JVM.
|
||||||
|
if [ -n "$JAVA_HOME" ] ; then
|
||||||
|
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
|
||||||
|
# IBM's JDK on AIX uses strange locations for the executables
|
||||||
|
JAVACMD="$JAVA_HOME/jre/sh/java"
|
||||||
|
else
|
||||||
|
JAVACMD="$JAVA_HOME/bin/java"
|
||||||
|
fi
|
||||||
|
if [ ! -x "$JAVACMD" ] ; then
|
||||||
|
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
|
||||||
|
|
||||||
|
Please set the JAVA_HOME variable in your environment to match the
|
||||||
|
location of your Java installation."
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
JAVACMD="java"
|
||||||
|
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
||||||
|
|
||||||
|
Please set the JAVA_HOME variable in your environment to match the
|
||||||
|
location of your Java installation."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Increase the maximum file descriptors if we can.
|
||||||
|
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
|
||||||
|
MAX_FD_LIMIT=`ulimit -H -n`
|
||||||
|
if [ $? -eq 0 ] ; then
|
||||||
|
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
|
||||||
|
MAX_FD="$MAX_FD_LIMIT"
|
||||||
|
fi
|
||||||
|
ulimit -n $MAX_FD
|
||||||
|
if [ $? -ne 0 ] ; then
|
||||||
|
warn "Could not set maximum file descriptor limit: $MAX_FD"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# For Darwin, add options to specify how the application appears in the dock
|
||||||
|
if $darwin; then
|
||||||
|
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
|
||||||
|
fi
|
||||||
|
|
||||||
|
# For Cygwin or MSYS, switch paths to Windows format before running java
|
||||||
|
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
|
||||||
|
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
|
||||||
|
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
|
||||||
|
|
||||||
|
JAVACMD=`cygpath --unix "$JAVACMD"`
|
||||||
|
|
||||||
|
# We build the pattern for arguments to be converted via cygpath
|
||||||
|
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
|
||||||
|
SEP=""
|
||||||
|
for dir in $ROOTDIRSRAW ; do
|
||||||
|
ROOTDIRS="$ROOTDIRS$SEP$dir"
|
||||||
|
SEP="|"
|
||||||
|
done
|
||||||
|
OURCYGPATTERN="(^($ROOTDIRS))"
|
||||||
|
# Add a user-defined pattern to the cygpath arguments
|
||||||
|
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
|
||||||
|
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
|
||||||
|
fi
|
||||||
|
# Now convert the arguments - kludge to limit ourselves to /bin/sh
|
||||||
|
i=0
|
||||||
|
for arg in "$@" ; do
|
||||||
|
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
|
||||||
|
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
|
||||||
|
|
||||||
|
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
|
||||||
|
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
|
||||||
|
else
|
||||||
|
eval `echo args$i`="\"$arg\""
|
||||||
|
fi
|
||||||
|
i=`expr $i + 1`
|
||||||
|
done
|
||||||
|
case $i in
|
||||||
|
0) set -- ;;
|
||||||
|
1) set -- "$args0" ;;
|
||||||
|
2) set -- "$args0" "$args1" ;;
|
||||||
|
3) set -- "$args0" "$args1" "$args2" ;;
|
||||||
|
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
|
||||||
|
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
|
||||||
|
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
|
||||||
|
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
|
||||||
|
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
|
||||||
|
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Escape application args
|
||||||
|
save () {
|
||||||
|
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
|
||||||
|
echo " "
|
||||||
|
}
|
||||||
|
APP_ARGS=`save "$@"`
|
||||||
|
|
||||||
|
# Collect all arguments for the java command, following the shell quoting and substitution rules
|
||||||
|
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
|
||||||
|
|
||||||
|
exec "$JAVACMD" "$@"
|
17
examples/llama.android/settings.gradle.kts
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
// Repositories used to resolve Gradle plugins for this build, listed in the
// order they are declared: Google's Maven repository, Maven Central, and the
// Gradle Plugin Portal.
pluginManagement {
|
||||||
|
repositories {
|
||||||
|
google()
|
||||||
|
mavenCentral()
|
||||||
|
gradlePluginPortal()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Central declaration of the repositories used to resolve project dependencies.
dependencyResolutionManagement {
|
||||||
|
// FAIL_ON_PROJECT_REPOS: repositories must be declared here; per Gradle's
// documentation the build fails if a project declares its own.
repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
|
||||||
|
repositories {
|
||||||
|
google()
|
||||||
|
mavenCentral()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Name of the root Gradle project.
rootProject.name = "LlamaAndroid"
|
||||||
|
// The build consists of a single subproject, ":app".
include(":app")
|
|
@ -1,4 +0,0 @@
|
||||||
# Build the `metal` example executable from metal.cpp and link it against ggml.
set(TEST_TARGET metal)
add_executable(${TEST_TARGET} metal.cpp)
# Install the executable defined above. The variable must be TEST_TARGET:
# this script never sets TARGET, so ${TARGET} would expand to nothing or to
# whatever an enclosing scope happened to define.
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
|
|
|
@ -1,103 +0,0 @@
|
||||||
// Evaluate a statically exported ggml computation graph with Metal
|
|
||||||
//
|
|
||||||
// - First, export a LLaMA graph:
|
|
||||||
//
|
|
||||||
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
|
|
||||||
//
|
|
||||||
// - Run this tool to evaluate the exported graph:
|
|
||||||
//
|
|
||||||
// $ ./bin/metal llama.ggml
|
|
||||||
//
|
|
||||||
// This tool is intended mostly for debugging and demonstration purposes.
|
|
||||||
// The main limitation of exporting computation graphs is that their sizes are static which often
|
|
||||||
// can be a problem for real-world applications.
|
|
||||||
//
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-metal.h"
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstring>
|
|
||||||
#include <cstdlib>
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
|
||||||
ggml_time_init();
|
|
||||||
|
|
||||||
if (argc != 2) {
|
|
||||||
fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
const char * fname_cgraph = argv[1];
|
|
||||||
|
|
||||||
// load the compute graph
|
|
||||||
struct ggml_context * ctx_data = NULL;
|
|
||||||
struct ggml_context * ctx_eval = NULL;
|
|
||||||
|
|
||||||
struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
|
|
||||||
|
|
||||||
// this allocates all Metal resources and memory buffers
|
|
||||||
auto * ctx_metal = ggml_metal_init(1);
|
|
||||||
|
|
||||||
const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
|
|
||||||
const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
|
|
||||||
ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
|
|
||||||
ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);
|
|
||||||
|
|
||||||
// main
|
|
||||||
{
|
|
||||||
struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
|
|
||||||
*(int32_t *) input->data = 1; // BOS
|
|
||||||
|
|
||||||
ggml_metal_set_tensor(ctx_metal, input);
|
|
||||||
|
|
||||||
// warmup
|
|
||||||
ggml_metal_graph_compute(ctx_metal, gf);
|
|
||||||
|
|
||||||
const int n_iter = 16;
|
|
||||||
|
|
||||||
const int64_t t0 = ggml_time_us();
|
|
||||||
|
|
||||||
// the actual inference happens here
|
|
||||||
for (int i = 0; i < n_iter; ++i) {
|
|
||||||
ggml_metal_graph_compute(ctx_metal, gf);
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t t1 = ggml_time_us();
|
|
||||||
|
|
||||||
printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
|
|
||||||
}
|
|
||||||
|
|
||||||
// debug output
|
|
||||||
{
|
|
||||||
struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1];
|
|
||||||
ggml_metal_get_tensor(ctx_metal, logits);
|
|
||||||
|
|
||||||
float * ptr = (float *) ggml_get_data(logits);
|
|
||||||
|
|
||||||
printf("logits: ");
|
|
||||||
for (int i = 0; i < 10; i++) {
|
|
||||||
printf("%8.4f ", ptr[i]);
|
|
||||||
}
|
|
||||||
printf("\n");
|
|
||||||
int imax = 0;
|
|
||||||
double sum = 0.0;
|
|
||||||
double vmax = -1e9;
|
|
||||||
for (int i = 0; i < 32000; i++) {
|
|
||||||
sum += (double) ptr[i];
|
|
||||||
if (ptr[i] > vmax) {
|
|
||||||
vmax = ptr[i];
|
|
||||||
imax = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_metal_free(ctx_metal);
|
|
||||||
|
|
||||||
ggml_free(ctx_data);
|
|
||||||
ggml_free(ctx_eval);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
|
@ -8,7 +8,11 @@
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
|
#include <atomic>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <array>
|
||||||
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
|
@ -189,15 +193,19 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||||
|
|
||||||
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
const int n_batch = params.n_batch;
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
double nll = 0.0;
|
double nll = 0.0;
|
||||||
|
|
||||||
fprintf(stderr, "%s: calculating perplexity over %d chunks\n", __func__, n_chunk);
|
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||||
|
|
||||||
for (int i = 0; i < n_chunk; ++i) {
|
for (int i = 0; i < n_chunk; ++i) {
|
||||||
const int start = i * params.ppl_stride;
|
const int start = i * params.ppl_stride;
|
||||||
//const int end = start + calc_chunk;
|
const int end = start + calc_chunk;
|
||||||
|
|
||||||
|
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
|
||||||
|
//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
|
||||||
|
|
||||||
std::vector<float> logits;
|
std::vector<float> logits;
|
||||||
|
|
||||||
|
@ -206,25 +214,31 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||||
// clear the KV cache
|
// clear the KV cache
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
|
for (int j = 0; j < num_batches; ++j) {
|
||||||
|
const int batch_start = start + j * n_batch;
|
||||||
|
const int batch_size = std::min(end - batch_start, n_batch);
|
||||||
|
|
||||||
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + start, calc_chunk, 0, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||||
//fprintf(stderr, "%s : failed to eval\n", __func__);
|
//fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
return {tokens, -1, logit_history, prob_history};
|
return {tokens, -1, logit_history, prob_history};
|
||||||
}
|
}
|
||||||
|
|
||||||
// save original token and restore it after eval
|
// save original token and restore it after eval
|
||||||
const auto token_org = tokens[start];
|
const auto token_org = tokens[batch_start];
|
||||||
|
|
||||||
// add BOS token for the first batch of each chunk
|
// add BOS token for the first batch of each chunk
|
||||||
if (add_bos) {
|
if (add_bos && j == 0) {
|
||||||
tokens[start] = llama_token_bos(llama_get_model(ctx));
|
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto * batch_logits = llama_get_logits(ctx);
|
const auto batch_logits = llama_get_logits(ctx);
|
||||||
logits.insert(logits.end(), batch_logits, batch_logits + calc_chunk * n_vocab);
|
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
||||||
|
|
||||||
tokens[start] = token_org;
|
if (j == 0) {
|
||||||
|
tokens[batch_start] = token_org;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
@ -236,8 +250,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
||||||
total_seconds = total_seconds % (60*60);
|
total_seconds = total_seconds % (60*60);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%.2f minutes ", total_seconds / 60.0);
|
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
||||||
fprintf(stderr, "(%.2f t/s)\n", n_ctx/t_total);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
|
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
|
||||||
|
@ -312,50 +325,55 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
double nll = 0.0;
|
double nll = 0.0;
|
||||||
double nll2 = 0.0;
|
double nll2 = 0.0;
|
||||||
|
|
||||||
|
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
|
||||||
|
|
||||||
|
std::vector<float> logits;
|
||||||
|
if (num_batches > 1) {
|
||||||
|
logits.reserve((size_t)n_ctx * n_vocab);
|
||||||
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||||
|
|
||||||
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||||
|
|
||||||
for (int i = 0; i < n_chunk; ++i) {
|
for (int i = 0; i < n_chunk; ++i) {
|
||||||
const int start = i * n_ctx;
|
const int start = i * n_ctx;
|
||||||
//const int end = start + n_ctx;
|
const int end = start + n_ctx;
|
||||||
|
|
||||||
//const int num_batches = (n_ctx + n_batch - 1) / n_batch;
|
|
||||||
|
|
||||||
std::vector<float> logits;
|
|
||||||
|
|
||||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
// clear the KV cache
|
// clear the KV cache
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
//for (int j = 0; j < num_batches; ++j) {
|
for (int j = 0; j < num_batches; ++j) {
|
||||||
// const int batch_start = start + j * n_batch;
|
const int batch_start = start + j * n_batch;
|
||||||
// const int batch_size = std::min(end - batch_start, n_batch);
|
const int batch_size = std::min(end - batch_start, n_batch);
|
||||||
|
|
||||||
// save original token and restore it after eval
|
// save original token and restore it after eval
|
||||||
const auto token_org = tokens[start];
|
const auto token_org = tokens[batch_start];
|
||||||
|
|
||||||
// add BOS token for the first batch of each chunk
|
// add BOS token for the first batch of each chunk
|
||||||
if (add_bos) {
|
if (add_bos && j == 0) {
|
||||||
tokens[start] = llama_token_bos(llama_get_model(ctx));
|
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + start, n_ctx, 0, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
return {tokens, -1, logit_history, prob_history};
|
return {tokens, -1, logit_history, prob_history};
|
||||||
}
|
}
|
||||||
|
|
||||||
// restore the original token in case it was set to BOS
|
// restore the original token in case it was set to BOS
|
||||||
tokens[start] = token_org;
|
tokens[batch_start] = token_org;
|
||||||
|
|
||||||
|
if (num_batches > 1) {
|
||||||
const auto * batch_logits = llama_get_logits(ctx);
|
const auto * batch_logits = llama_get_logits(ctx);
|
||||||
logits.insert(logits.end(), batch_logits, batch_logits + n_ctx * n_vocab);
|
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
||||||
//}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
if (i == 1) { // TODO: skipping the first chunk gives a better estimate, but breaks formatting
|
if (i == 0) {
|
||||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||||
int total_seconds = (int)(t_total * n_chunk);
|
int total_seconds = (int)(t_total * n_chunk);
|
||||||
|
@ -363,9 +381,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
||||||
total_seconds = total_seconds % (60*60);
|
total_seconds = total_seconds % (60*60);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%.2f minutes ", total_seconds / 60.0);
|
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
||||||
fprintf(stderr, "(%.2f t/s)\n", n_ctx/t_total);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// We get the logits for all the tokens in the context window (params.n_ctx)
|
// We get the logits for all the tokens in the context window (params.n_ctx)
|
||||||
|
@ -381,7 +397,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
// last 256 tokens. Then, we split the input up into context window size chunks to
|
// last 256 tokens. Then, we split the input up into context window size chunks to
|
||||||
// process the entire prompt.
|
// process the entire prompt.
|
||||||
const int first = n_ctx/2;
|
const int first = n_ctx/2;
|
||||||
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||||
|
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
||||||
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
||||||
count += n_ctx - first - 1;
|
count += n_ctx - first - 1;
|
||||||
|
|
||||||
|
@ -395,6 +412,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
|
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
|
|
||||||
|
logits.clear();
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
||||||
|
@ -412,26 +431,73 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
return {tokens, ppl, logit_history, prob_history};
|
return {tokens, ppl, logit_history, prob_history};
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::vector<float> hellaswag_evaluate_tokens(
|
static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
|
||||||
llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab
|
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||||
) {
|
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||||
std::vector<float> result;
|
|
||||||
result.reserve(tokens.size() * n_vocab);
|
llama_batch batch_view = {
|
||||||
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
|
n_tokens,
|
||||||
for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
|
batch.token + i,
|
||||||
size_t n_tokens = tokens.size() - i_chunk * n_batch;
|
nullptr,
|
||||||
n_tokens = std::min(n_tokens, size_t(n_batch));
|
batch.pos + i,
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
|
batch.n_seq_id + i,
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
batch.seq_id + i,
|
||||||
return {};
|
batch.logits + i,
|
||||||
|
0, 0, 0, // unused
|
||||||
|
};
|
||||||
|
|
||||||
|
const int ret = llama_decode(ctx, batch_view);
|
||||||
|
if (ret != 0) {
|
||||||
|
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto * logits = llama_get_logits(ctx);
|
memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float));
|
||||||
result.insert(result.end(), logits, logits + n_tokens * n_vocab);
|
}
|
||||||
|
|
||||||
n_past += n_tokens;
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define K_TOKEN_CHUNK 4
|
||||||
|
|
||||||
|
static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
|
||||||
|
const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
|
||||||
|
if (eval_results.size() != eval_pairs.size()) {
|
||||||
|
eval_results.resize(eval_pairs.size());
|
||||||
|
}
|
||||||
|
if (eval_pairs.empty()) return;
|
||||||
|
|
||||||
|
size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
|
||||||
|
|
||||||
|
std::atomic<int> counter(0);
|
||||||
|
auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
|
||||||
|
float local_logprobs[K_TOKEN_CHUNK];
|
||||||
|
while (true) {
|
||||||
|
size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
|
||||||
|
if (first >= eval_results.size()) break;
|
||||||
|
size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
|
||||||
|
for (size_t i = first; i < last; ++i) {
|
||||||
|
auto logits = batch_logits + eval_pairs[i].first * n_vocab;
|
||||||
|
float max_logit = logits[0];
|
||||||
|
for (int j = 1; j < n_vocab; ++j) {
|
||||||
|
max_logit = std::max(max_logit, logits[j]);
|
||||||
|
}
|
||||||
|
float sum_p = 0.f;
|
||||||
|
for (int j = 0; j < n_vocab; ++j) {
|
||||||
|
sum_p += expf(logits[j] - max_logit);
|
||||||
|
}
|
||||||
|
local_logprobs[i - first] = logits[eval_pairs[i].second] - max_logit - std::log(sum_p);
|
||||||
|
}
|
||||||
|
std::memcpy(eval_results.data() + first, local_logprobs, (last - first)*sizeof(float));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
for (size_t it = 0; it < max_threads; ++it) {
|
||||||
|
workers[it] = std::thread(compute);
|
||||||
|
}
|
||||||
|
for (size_t it = 0; it < max_threads; ++it) {
|
||||||
|
workers[it].join();
|
||||||
}
|
}
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
|
@ -492,27 +558,53 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
std::string ending[4];
|
std::string ending[4];
|
||||||
size_t ending_logprob_count[4];
|
size_t ending_logprob_count[4];
|
||||||
double ending_logprob[4];
|
double ending_logprob[4];
|
||||||
|
|
||||||
|
size_t i_batch; // starting index in the llama_batch
|
||||||
|
size_t common_prefix; // max number of initial tokens that are the same in all sentences
|
||||||
|
size_t required_tokens; // needed number of tokens to evaluate all 4 endings
|
||||||
|
std::vector<llama_token> seq_tokens[4];
|
||||||
};
|
};
|
||||||
|
|
||||||
fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
|
fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
|
||||||
|
|
||||||
// Select and read data from prompt lines
|
// Select and read data from prompt lines
|
||||||
hs_data_t *hs_data = new hs_data_t[hs_task_count];
|
std::vector<hs_data_t> hs_data(hs_task_count);
|
||||||
for (size_t i = 0; i < hs_task_count; i++) {
|
for (size_t i = 0; i < hs_task_count; i++) {
|
||||||
size_t idx = i;
|
size_t idx = i;
|
||||||
|
|
||||||
|
auto & hs_cur = hs_data[i];
|
||||||
|
|
||||||
// Select a random example of those left in the prompt
|
// Select a random example of those left in the prompt
|
||||||
if (randomize_tasks) {
|
if (randomize_tasks) {
|
||||||
std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
|
std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
|
||||||
idx = dist(rng);
|
idx = dist(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
hs_data[i].context = prompt_lines[idx*6];
|
hs_cur.context = prompt_lines[idx*6];
|
||||||
hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
||||||
for (size_t j = 0; j < 4; j++) {
|
for (size_t j = 0; j < 4; j++) {
|
||||||
hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
|
hs_cur.ending[j] = prompt_lines[idx*6+2+j];
|
||||||
|
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// determine the common prefix of the endings
|
||||||
|
hs_cur.common_prefix = 0;
|
||||||
|
for (size_t k = 0; k < hs_cur.seq_tokens[0].size(); k++) {
|
||||||
|
if (hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[1][k] ||
|
||||||
|
hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[2][k] ||
|
||||||
|
hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[3][k]) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
hs_cur.common_prefix++;
|
||||||
|
}
|
||||||
|
hs_cur.required_tokens = hs_cur.common_prefix +
|
||||||
|
hs_cur.seq_tokens[0].size() - hs_cur.common_prefix +
|
||||||
|
hs_cur.seq_tokens[1].size() - hs_cur.common_prefix +
|
||||||
|
hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
|
||||||
|
hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
|
||||||
|
|
||||||
|
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
|
||||||
|
|
||||||
// Delete the selected random example from the prompt
|
// Delete the selected random example from the prompt
|
||||||
if (randomize_tasks) {
|
if (randomize_tasks) {
|
||||||
prompt_lines.erase( std::next(prompt_lines.begin(),idx*6) , std::next(prompt_lines.begin(),idx*6+6) );
|
prompt_lines.erase( std::next(prompt_lines.begin(),idx*6) , std::next(prompt_lines.begin(),idx*6+6) );
|
||||||
|
@ -520,164 +612,436 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
|
fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
|
||||||
|
|
||||||
printf("\ntask\tacc_norm\n");
|
printf("\ntask\tacc_norm\n");
|
||||||
|
|
||||||
double acc = 0.0f;
|
double acc = 0.0f;
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
const int n_batch = params.n_batch;
|
||||||
|
|
||||||
std::vector<std::vector<int>> ending_tokens(4);
|
const int max_tasks_per_batch = 32;
|
||||||
|
const int max_seq = 4*max_tasks_per_batch;
|
||||||
|
|
||||||
|
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
|
||||||
|
|
||||||
std::vector<float> tok_logits(n_vocab);
|
std::vector<float> tok_logits(n_vocab);
|
||||||
|
std::vector<float> batch_logits(n_vocab*n_ctx);
|
||||||
|
|
||||||
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
|
std::vector<std::pair<size_t, llama_token>> eval_pairs;
|
||||||
// Tokenize the context to count tokens
|
std::vector<float> eval_results;
|
||||||
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
|
std::vector<std::thread> workers(std::thread::hardware_concurrency());
|
||||||
size_t context_size = context_embd.size();
|
|
||||||
|
|
||||||
for (int i = 0; i < 4; ++i) {
|
for (size_t i0 = 0; i0 < hs_task_count; i0++) {
|
||||||
ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
|
int n_cur = 0;
|
||||||
for (int k = 0; k < int(context_size); ++k) {
|
|
||||||
if (ending_tokens[i][k] != context_embd[k]) {
|
size_t i1 = i0;
|
||||||
fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
|
size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
|
||||||
|
|
||||||
|
llama_batch_clear(batch);
|
||||||
|
|
||||||
|
// batch as many tasks as possible into the available context
|
||||||
|
// each task has 4 unique sequence ids - one for each ending
|
||||||
|
// the common prefix is shared among the 4 sequences to save tokens
|
||||||
|
// we extract logits only from the last common token and from all ending tokens of each sequence
|
||||||
|
while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
|
||||||
|
auto & hs_cur = hs_data[i1];
|
||||||
|
|
||||||
|
const int s0 = 4*(i1 - i0);
|
||||||
|
if (s0 + 4 > max_seq) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
|
||||||
|
llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
|
||||||
|
}
|
||||||
|
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
|
||||||
|
|
||||||
|
for (int s = 0; s < 4; ++s) {
|
||||||
|
for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size(); ++i) {
|
||||||
|
llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
hs_cur.i_batch = i_batch;
|
||||||
|
i_batch += hs_cur.required_tokens;
|
||||||
|
|
||||||
|
n_cur += hs_data[i1].required_tokens;
|
||||||
|
if (++i1 == hs_task_count) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Do the 1st ending
|
if (i0 == i1) {
|
||||||
// In this case we include the context when evaluating
|
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||||
//auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
|
|
||||||
auto query_embd = ending_tokens[0];
|
|
||||||
auto query_size = query_embd.size();
|
|
||||||
|
|
||||||
// Stop if query wont fit the ctx window
|
|
||||||
if (query_size > (size_t)n_ctx) {
|
|
||||||
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Speedup small evaluations by evaluating atleast 32 tokens
|
|
||||||
if (query_size < 32) {
|
|
||||||
query_embd.resize(32);
|
|
||||||
}
|
|
||||||
|
|
||||||
// clear the KV cache
|
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
|
// decode all tasks [i0, i1)
|
||||||
if (logits.empty()) {
|
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
|
// Compute log-probs in parallel
|
||||||
|
// First we collect all tasks
|
||||||
|
eval_pairs.clear();
|
||||||
|
for (size_t i = i0; i < i1; ++i) {
|
||||||
|
auto & hs_cur = hs_data[i];
|
||||||
|
size_t li = hs_cur.common_prefix;
|
||||||
|
for (int s = 0; s < 4; ++s) {
|
||||||
|
for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
|
||||||
|
eval_pairs.push_back(std::make_pair(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]));
|
||||||
|
}
|
||||||
|
++li;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Then we do the actual calculation
|
||||||
|
compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
|
||||||
|
|
||||||
|
size_t ir = 0;
|
||||||
|
|
||||||
|
// compute the logprobs for each ending of the decoded tasks
|
||||||
|
for (size_t i = i0; i < i1; ++i) {
|
||||||
|
auto & hs_cur = hs_data[i];
|
||||||
|
|
||||||
|
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + hs_cur.common_prefix - 1), n_vocab*sizeof(float));
|
||||||
|
|
||||||
const auto first_probs = softmax(tok_logits);
|
const auto first_probs = softmax(tok_logits);
|
||||||
|
|
||||||
hs_data[task_idx].ending_logprob_count[0] = 1;
|
for (int s = 0; s < 4; ++s) {
|
||||||
hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);
|
hs_cur.ending_logprob_count[s] = 1;
|
||||||
|
hs_cur.ending_logprob[s] = std::log(first_probs[hs_cur.seq_tokens[s][hs_cur.common_prefix]]);
|
||||||
// Calculate the logprobs over the ending
|
for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
|
||||||
for (size_t j = context_size; j < query_size - 1; j++) {
|
hs_cur.ending_logprob[s] += eval_results[ir++];
|
||||||
|
hs_cur.ending_logprob_count[s]++;
|
||||||
std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
|
|
||||||
|
|
||||||
const float prob = softmax(tok_logits)[query_embd[j + 1]];
|
|
||||||
|
|
||||||
hs_data[task_idx].ending_logprob[0] += std::log(prob);
|
|
||||||
hs_data[task_idx].ending_logprob_count[0]++;
|
|
||||||
}
|
}
|
||||||
|
hs_cur.ending_logprob[s] /= hs_cur.ending_logprob_count[s];
|
||||||
// Calculate the mean token logprob for acc_norm
|
|
||||||
hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
|
|
||||||
|
|
||||||
// Do the remaining endings
|
|
||||||
// For these, we use the bare ending with n_past = context_size
|
|
||||||
//
|
|
||||||
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
|
|
||||||
|
|
||||||
// Tokenize the query
|
|
||||||
query_embd.resize(ending_tokens[ending_idx].size() - context_size);
|
|
||||||
std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
|
|
||||||
query_size = query_embd.size();
|
|
||||||
|
|
||||||
// Stop if query wont fit the ctx window
|
|
||||||
if (context_size + query_size > (size_t)n_ctx) {
|
|
||||||
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Speedup small evaluations by evaluating atleast 32 tokens
|
|
||||||
// No, resizing to 32 is actually slightly slower (at least on CUDA)
|
|
||||||
//if (query_size < 32) {
|
|
||||||
// query_embd.resize(32);
|
|
||||||
//}
|
|
||||||
|
|
||||||
// Evaluate the query
|
|
||||||
logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
|
|
||||||
if (logits.empty()) {
|
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
|
|
||||||
hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
|
|
||||||
|
|
||||||
// Calculate the logprobs over the ending
|
|
||||||
for (size_t j = 0; j < query_size - 1; j++) {
|
|
||||||
std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
|
|
||||||
|
|
||||||
const float prob = softmax(tok_logits)[query_embd[j + 1]];
|
|
||||||
|
|
||||||
hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
|
|
||||||
hs_data[task_idx].ending_logprob_count[ending_idx]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate the mean token logprob for acc_norm
|
|
||||||
hs_data[task_idx].ending_logprob[ending_idx] /= hs_data[task_idx].ending_logprob_count[ending_idx];
|
|
||||||
|
|
||||||
|
|
||||||
// printf("task %lu, ending %lu, whole_len %lu, context_len %lu, ending_logprob_count %lu, ending_logprob %.4f\n",
|
|
||||||
// task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the ending with maximum logprob
|
// Find the ending with maximum logprob
|
||||||
size_t ending_logprob_max_idx = 0;
|
size_t ending_logprob_max_idx = 0;
|
||||||
double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
|
double ending_logprob_max_val = hs_cur.ending_logprob[0];
|
||||||
for (size_t j = 1; j < 4; j++) {
|
for (size_t s = 1; s < 4; s++) {
|
||||||
if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
|
if (hs_cur.ending_logprob[s] > ending_logprob_max_val) {
|
||||||
ending_logprob_max_idx = j;
|
ending_logprob_max_idx = s;
|
||||||
ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];
|
ending_logprob_max_val = hs_cur.ending_logprob[s];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_data[task_idx].gold_ending_idx);
|
//printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
|
||||||
|
|
||||||
// If the gold ending got the maximum logprob add one accuracy point
|
// If the gold ending got the maximum logprob add one accuracy point
|
||||||
if (ending_logprob_max_idx == hs_data[task_idx].gold_ending_idx) {
|
if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
|
||||||
acc += 1.0;
|
acc += 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Print the accumulated accuracy mean x 100
|
// Print the accumulated accuracy mean x 100
|
||||||
printf("%zu\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
|
printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
|
|
||||||
delete [] hs_data;
|
i0 = i1 - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_batch_free(batch);
|
||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct winogrande_entry {
|
||||||
|
std::string first;
|
||||||
|
std::string second;
|
||||||
|
std::array<std::string, 2> choices;
|
||||||
|
int answer;
|
||||||
|
|
||||||
|
size_t i_batch;
|
||||||
|
size_t common_prefix;
|
||||||
|
size_t required_tokens;
|
||||||
|
size_t n_base1; // number of tokens for context + choice 1
|
||||||
|
size_t n_base2; // number of tokens for context + choice 2
|
||||||
|
std::vector<llama_token> seq_tokens[2];
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
|
||||||
|
std::vector<winogrande_entry> result;
|
||||||
|
std::istringstream in(prompt);
|
||||||
|
std::string line;
|
||||||
|
std::array<int, 4> comma_pos;
|
||||||
|
while (true) {
|
||||||
|
std::getline(in, line);
|
||||||
|
if (in.fail() || in.eof()) break;
|
||||||
|
int ipos = 0;
|
||||||
|
bool quote_open = false;
|
||||||
|
for (int i = 0; i < int(line.size()); ++i) {
|
||||||
|
if (!quote_open) {
|
||||||
|
if (line[i] == ',') {
|
||||||
|
comma_pos[ipos++] = i;
|
||||||
|
if (ipos == 4) break;
|
||||||
|
}
|
||||||
|
else if (line[i] == '"') {
|
||||||
|
quote_open = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (line[i] == '"') {
|
||||||
|
quote_open = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (ipos != 4) {
|
||||||
|
printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
|
||||||
|
: line.substr(comma_pos[0]+1, comma_pos[1] - comma_pos[0] - 1);
|
||||||
|
auto choice1 = line.substr(comma_pos[1]+1, comma_pos[2] - comma_pos[1] - 1);
|
||||||
|
auto choice2 = line.substr(comma_pos[2]+1, comma_pos[3] - comma_pos[2] - 1);
|
||||||
|
auto answer = line.substr(comma_pos[3]+1, line.size() - comma_pos[3] - 1);
|
||||||
|
auto index = line.substr(0, comma_pos[0]);
|
||||||
|
int where = 0;
|
||||||
|
for ( ; where < int(sentence.size()); ++where) {
|
||||||
|
if (sentence[where] == '_') break;
|
||||||
|
}
|
||||||
|
if (where == int(sentence.size())) {
|
||||||
|
printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
std::istringstream stream(answer.c_str());
|
||||||
|
int i_answer; stream >> i_answer;
|
||||||
|
if (stream.fail() || i_answer < 1 || i_answer > 2) {
|
||||||
|
printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
result.emplace_back();
|
||||||
|
auto& wg = result.back();
|
||||||
|
wg.first = sentence.substr(0, where);
|
||||||
|
wg.second = sentence.substr(where + 1, sentence.size() - where - 1);
|
||||||
|
wg.choices[0] = std::move(choice1);
|
||||||
|
wg.choices[1] = std::move(choice2);
|
||||||
|
wg.answer = i_answer;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Evaluates the Winogrande score.
|
||||||
|
* Uses a CSV containing task index, sentence, choice 1, choice 2, answer (1 or 2)
|
||||||
|
* You can get one such dataset from e.g. https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp
|
||||||
|
* As an example, the 1st row in the above dataset is
|
||||||
|
*
|
||||||
|
* 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||||
|
|
||||||
|
constexpr int k_min_trailing_ctx = 3;
|
||||||
|
|
||||||
|
auto data = load_winogrande_from_csv(params.prompt);
|
||||||
|
if (data.empty()) {
|
||||||
|
fprintf(stderr, "%s: no tasks\n", __func__);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
|
||||||
|
|
||||||
|
if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
|
||||||
|
fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
|
||||||
|
std::mt19937 rng(1);
|
||||||
|
std::vector<int> aux(data.size());
|
||||||
|
for (int i = 0; i < int(data.size()); ++i) {
|
||||||
|
aux[i] = i;
|
||||||
|
}
|
||||||
|
float scale = 1/(1.f + (float)rng.max());
|
||||||
|
std::vector<winogrande_entry> selected;
|
||||||
|
selected.resize(params.winogrande_tasks);
|
||||||
|
for (int i = 0; i < int(params.winogrande_tasks); ++i) {
|
||||||
|
int j = int(scale*rng()*aux.size());
|
||||||
|
selected[i] = std::move(data[aux[j]]);
|
||||||
|
aux[j] = aux.back();
|
||||||
|
aux.pop_back();
|
||||||
|
}
|
||||||
|
data = std::move(selected);
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
|
||||||
|
|
||||||
|
// This is needed as usual for LLaMA models
|
||||||
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
|
||||||
|
for (auto & task : data) {
|
||||||
|
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
|
||||||
|
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
|
||||||
|
|
||||||
|
task.common_prefix = 0;
|
||||||
|
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
|
||||||
|
if (task.seq_tokens[0][k] != task.seq_tokens[1][k]) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
task.common_prefix++;
|
||||||
|
}
|
||||||
|
|
||||||
|
task.required_tokens = task.common_prefix +
|
||||||
|
task.seq_tokens[0].size() - task.common_prefix +
|
||||||
|
task.seq_tokens[1].size() - task.common_prefix;
|
||||||
|
|
||||||
|
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
|
||||||
|
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
||||||
|
|
||||||
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
const int n_batch = params.n_batch;
|
||||||
|
|
||||||
|
const int max_tasks_per_batch = 128;
|
||||||
|
const int max_seq = 2*max_tasks_per_batch;
|
||||||
|
|
||||||
|
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
|
||||||
|
|
||||||
|
std::vector<float> tok_logits(n_vocab);
|
||||||
|
std::vector<float> batch_logits(n_vocab*n_ctx);
|
||||||
|
|
||||||
|
std::vector<std::pair<size_t, llama_token>> eval_pairs;
|
||||||
|
std::vector<float> eval_results;
|
||||||
|
std::vector<std::thread> workers(std::thread::hardware_concurrency());
|
||||||
|
|
||||||
|
int n_correct = 0;
|
||||||
|
int n_done = 0;
|
||||||
|
|
||||||
|
for (size_t i0 = 0; i0 < data.size(); i0++) {
|
||||||
|
int n_cur = 0;
|
||||||
|
|
||||||
|
size_t i1 = i0;
|
||||||
|
size_t i_batch = 0;
|
||||||
|
|
||||||
|
llama_batch_clear(batch);
|
||||||
|
|
||||||
|
while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
|
||||||
|
const int s0 = 2*(i1 - i0);
|
||||||
|
if (s0 + 2 > max_seq) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < data[i1].common_prefix; ++i) {
|
||||||
|
llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1}, false);
|
||||||
|
}
|
||||||
|
batch.logits[batch.n_tokens - 1] = true;
|
||||||
|
|
||||||
|
for (int s = 0; s < 2; ++s) {
|
||||||
|
for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
|
||||||
|
llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
data[i1].i_batch = i_batch;
|
||||||
|
i_batch += data[i1].required_tokens;
|
||||||
|
|
||||||
|
n_cur += data[i1].required_tokens;
|
||||||
|
if (++i1 == data.size()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i0 == i1) {
|
||||||
|
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
|
// decode all tasks [i0, i1)
|
||||||
|
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
||||||
|
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
eval_pairs.clear();
|
||||||
|
for (size_t i = i0; i < i1; ++i) {
|
||||||
|
auto & task = data[i];
|
||||||
|
|
||||||
|
const bool skip_choice =
|
||||||
|
task.seq_tokens[0].size() - task.common_prefix > k_min_trailing_ctx &&
|
||||||
|
task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx;
|
||||||
|
|
||||||
|
const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
|
||||||
|
const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
|
||||||
|
size_t li = n_base1 - 1;
|
||||||
|
for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
|
||||||
|
eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[0][j+1]));
|
||||||
|
}
|
||||||
|
const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
|
||||||
|
const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
|
||||||
|
li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
|
||||||
|
for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
|
||||||
|
eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[1][j+1]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
|
||||||
|
|
||||||
|
size_t ir = 0;
|
||||||
|
for (size_t i = i0; i < i1; ++i) {
|
||||||
|
auto & task = data[i];
|
||||||
|
|
||||||
|
const bool skip_choice =
|
||||||
|
task.seq_tokens[0].size() - task.common_prefix > k_min_trailing_ctx &&
|
||||||
|
task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx;
|
||||||
|
|
||||||
|
float score_1st = 0;
|
||||||
|
const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
|
||||||
|
const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
|
||||||
|
for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
|
||||||
|
score_1st += eval_results[ir++];
|
||||||
|
}
|
||||||
|
score_1st /= (task.seq_tokens[0].size() - n_base1 - last_1st);
|
||||||
|
|
||||||
|
float score_2nd = 0;
|
||||||
|
const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
|
||||||
|
const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
|
||||||
|
for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
|
||||||
|
score_2nd += eval_results[ir++];
|
||||||
|
}
|
||||||
|
score_2nd /= (task.seq_tokens[1].size() - n_base2 - last_2nd);
|
||||||
|
|
||||||
|
int result = score_1st > score_2nd ? 1 : 2;
|
||||||
|
|
||||||
|
if (result == task.answer) {
|
||||||
|
++n_correct;
|
||||||
|
}
|
||||||
|
++n_done;
|
||||||
|
|
||||||
|
// print the accumulated accuracy mean x 100
|
||||||
|
printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
|
||||||
|
i0 = i1 - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
if (n_done < 100) return;
|
||||||
|
|
||||||
|
const float p = 1.f*n_correct/n_done;
|
||||||
|
const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
|
||||||
|
printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
//params.n_batch = 512;
|
params.n_batch = 512;
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
//params.n_batch = std::min(params.n_batch, params.n_ctx);
|
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||||
|
|
||||||
if (params.ppl_stride > 0) {
|
if (params.ppl_stride > 0) {
|
||||||
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
|
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
|
||||||
|
@ -725,6 +1089,8 @@ int main(int argc, char ** argv) {
|
||||||
struct results_perplexity results;
|
struct results_perplexity results;
|
||||||
if (params.hellaswag) {
|
if (params.hellaswag) {
|
||||||
hellaswag_score(ctx, params);
|
hellaswag_score(ctx, params);
|
||||||
|
} else if (params.winogrande) {
|
||||||
|
winogrande_score(ctx, params);
|
||||||
} else {
|
} else {
|
||||||
results = perplexity(ctx, params);
|
results = perplexity(ctx, params);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Function calling example using pydantic models.
|
# Function calling example using pydantic models.
|
||||||
|
import datetime
|
||||||
import json
|
import json
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Union, Optional
|
from typing import Union, Optional
|
||||||
|
@ -8,7 +8,8 @@ import requests
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
import importlib
|
import importlib
|
||||||
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation
|
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
|
||||||
|
|
||||||
|
|
||||||
# Function to get completion on the llama.cpp server with grammar.
|
# Function to get completion on the llama.cpp server with grammar.
|
||||||
def create_completion(prompt, grammar):
|
def create_completion(prompt, grammar):
|
||||||
|
@ -134,3 +135,121 @@ text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
||||||
json_data = json.loads(text)
|
json_data = json.loads(text)
|
||||||
|
|
||||||
print(Book(**json_data))
|
print(Book(**json_data))
|
||||||
|
# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
|
||||||
|
|
||||||
|
def get_current_datetime(output_format: Optional[str] = None):
|
||||||
|
"""
|
||||||
|
Get the current date and time in the given format.
|
||||||
|
Args:
|
||||||
|
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
|
||||||
|
"""
|
||||||
|
if output_format is None:
|
||||||
|
output_format = '%Y-%m-%d %H:%M:%S'
|
||||||
|
return datetime.datetime.now().strftime(output_format)
|
||||||
|
|
||||||
|
|
||||||
|
# Enum for the calculator tool.
|
||||||
|
class MathOperation(Enum):
|
||||||
|
ADD = "add"
|
||||||
|
SUBTRACT = "subtract"
|
||||||
|
MULTIPLY = "multiply"
|
||||||
|
DIVIDE = "divide"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
|
||||||
|
class Calculator(BaseModel):
|
||||||
|
"""
|
||||||
|
Perform a math operation on two numbers.
|
||||||
|
"""
|
||||||
|
number_one: Union[int, float] = Field(..., description="First number.")
|
||||||
|
operation: MathOperation = Field(..., description="Math operation to perform.")
|
||||||
|
number_two: Union[int, float] = Field(..., description="Second number.")
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
if self.operation == MathOperation.ADD:
|
||||||
|
return self.number_one + self.number_two
|
||||||
|
elif self.operation == MathOperation.SUBTRACT:
|
||||||
|
return self.number_one - self.number_two
|
||||||
|
elif self.operation == MathOperation.MULTIPLY:
|
||||||
|
return self.number_one * self.number_two
|
||||||
|
elif self.operation == MathOperation.DIVIDE:
|
||||||
|
return self.number_one / self.number_two
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown operation.")
|
||||||
|
|
||||||
|
|
||||||
|
# Example function to get the weather
|
||||||
|
def get_current_weather(location, unit):
|
||||||
|
"""Get the current weather in a given location"""
|
||||||
|
if "London" in location:
|
||||||
|
return json.dumps({"location": "London", "temperature": "42", "unit": unit.value})
|
||||||
|
elif "New York" in location:
|
||||||
|
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
|
||||||
|
elif "North Pole" in location:
|
||||||
|
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
|
||||||
|
else:
|
||||||
|
return json.dumps({"location": location, "temperature": "unknown"})
|
||||||
|
|
||||||
|
|
||||||
|
# Here is a function definition in OpenAI style
|
||||||
|
current_weather_tool = {
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "get_current_weather",
|
||||||
|
"description": "Get the current weather in a given location",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"location": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The city and state, e.g. San Francisco, CA",
|
||||||
|
},
|
||||||
|
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||||
|
},
|
||||||
|
"required": ["location"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Convert OpenAI function definition into pydantic model
|
||||||
|
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
|
||||||
|
# Add the actual function to a pydantic model
|
||||||
|
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
|
||||||
|
|
||||||
|
# Convert normal Python function to a pydantic model
|
||||||
|
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
|
||||||
|
|
||||||
|
tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
|
||||||
|
|
||||||
|
|
||||||
|
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
||||||
|
pydantic_model_list=tool_list, outer_object_name="function",
|
||||||
|
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
|
||||||
|
|
||||||
|
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
|
||||||
|
|
||||||
|
|
||||||
|
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
|
||||||
|
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
||||||
|
|
||||||
|
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
||||||
|
|
||||||
|
json_data = json.loads(text)
|
||||||
|
|
||||||
|
print(json_data)
|
||||||
|
# Should output something like this:
|
||||||
|
# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
|
||||||
|
|
||||||
|
|
||||||
|
for call in json_data:
|
||||||
|
if call["function"] == "Calculator":
|
||||||
|
print(Calculator(**call["params"]).run())
|
||||||
|
elif call["function"] == "get_current_datetime":
|
||||||
|
print(current_datetime_model(**call["params"]).run())
|
||||||
|
elif call["function"] == "get_current_weather":
|
||||||
|
print(current_weather_tool_model(**call["params"]).run())
|
||||||
|
# Should output something like this:
|
||||||
|
# 2024-01-14 13:36:06
|
||||||
|
# {"location": "London", "temperature": "42", "unit": "celsius"}
|
||||||
|
# 1764
|
||||||
|
|
|
@ -5,6 +5,10 @@
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <fstream>
|
||||||
|
#include <cmath>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
struct quant_option {
|
struct quant_option {
|
||||||
std::string name;
|
std::string name;
|
||||||
|
@ -17,6 +21,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
|
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
|
||||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
|
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
|
||||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
|
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
|
||||||
|
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
|
||||||
|
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
|
||||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
||||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
|
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
|
||||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||||
|
@ -72,10 +78,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||||
//
|
//
|
||||||
[[noreturn]]
|
[[noreturn]]
|
||||||
static void usage(const char * executable) {
|
static void usage(const char * executable) {
|
||||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||||
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
||||||
|
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
|
||||||
|
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
||||||
|
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
||||||
|
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
||||||
printf("\nAllowed quantization types:\n");
|
printf("\nAllowed quantization types:\n");
|
||||||
for (auto & it : QUANT_OPTIONS) {
|
for (auto & it : QUANT_OPTIONS) {
|
||||||
if (it.name != "COPY") {
|
if (it.name != "COPY") {
|
||||||
|
@ -83,11 +93,93 @@ static void usage(const char * executable) {
|
||||||
} else {
|
} else {
|
||||||
printf(" ");
|
printf(" ");
|
||||||
}
|
}
|
||||||
printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
|
printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
|
||||||
}
|
}
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
|
||||||
|
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
|
||||||
|
if (!in) {
|
||||||
|
printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int n_entries;
|
||||||
|
in.read((char*)&n_entries, sizeof(n_entries));
|
||||||
|
if (in.fail() || n_entries < 1) {
|
||||||
|
printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < n_entries; ++i) {
|
||||||
|
int len; in.read((char *)&len, sizeof(len));
|
||||||
|
std::vector<char> name_as_vec(len+1);
|
||||||
|
in.read((char *)name_as_vec.data(), len);
|
||||||
|
if (in.fail()) {
|
||||||
|
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
name_as_vec[len] = 0;
|
||||||
|
std::string name{name_as_vec.data()};
|
||||||
|
auto& e = imatrix_data[std::move(name)];
|
||||||
|
int ncall;
|
||||||
|
in.read((char*)&ncall, sizeof(ncall));
|
||||||
|
int nval;
|
||||||
|
in.read((char *)&nval, sizeof(nval));
|
||||||
|
if (in.fail() || nval < 1) {
|
||||||
|
printf("%s: failed reading number of values for entry %d\n",__func__,i);
|
||||||
|
imatrix_data = {};
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
e.resize(nval);
|
||||||
|
in.read((char*)e.data(), nval*sizeof(float));
|
||||||
|
if (in.fail()) {
|
||||||
|
printf("%s: failed reading data for entry %d\n",__func__,i);
|
||||||
|
imatrix_data = {};
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (ncall > 0) {
|
||||||
|
for (auto& v : e) v /= ncall;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
static void prepare_imatrix(const std::string& imatrix_file,
|
||||||
|
const std::vector<std::string>& included_weights,
|
||||||
|
const std::vector<std::string>& excluded_weights,
|
||||||
|
std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
|
||||||
|
if (!imatrix_file.empty()) {
|
||||||
|
load_imatrix(imatrix_file, imatrix_data);
|
||||||
|
}
|
||||||
|
if (imatrix_data.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!excluded_weights.empty()) {
|
||||||
|
for (auto& name : excluded_weights) {
|
||||||
|
for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
|
||||||
|
auto pos = it->first.find(name);
|
||||||
|
if (pos != std::string::npos) it = imatrix_data.erase(it);
|
||||||
|
else ++it;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!included_weights.empty()) {
|
||||||
|
std::unordered_map<std::string, std::vector<float>> tmp;
|
||||||
|
for (auto& name : included_weights) {
|
||||||
|
for (auto& e : imatrix_data) {
|
||||||
|
auto pos = e.first.find(name);
|
||||||
|
if (pos != std::string::npos) {
|
||||||
|
tmp.emplace(std::move(e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
imatrix_data = std::move(tmp);
|
||||||
|
}
|
||||||
|
if (!imatrix_data.empty()) {
|
||||||
|
printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
if (argc < 3) {
|
if (argc < 3) {
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
|
@ -96,6 +188,8 @@ int main(int argc, char ** argv) {
|
||||||
llama_model_quantize_params params = llama_model_quantize_default_params();
|
llama_model_quantize_params params = llama_model_quantize_default_params();
|
||||||
|
|
||||||
int arg_idx = 1;
|
int arg_idx = 1;
|
||||||
|
std::string imatrix_file;
|
||||||
|
std::vector<std::string> included_weights, excluded_weights;
|
||||||
|
|
||||||
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
||||||
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
|
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
|
||||||
|
@ -104,14 +198,42 @@ int main(int argc, char ** argv) {
|
||||||
params.allow_requantize = true;
|
params.allow_requantize = true;
|
||||||
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
|
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
|
||||||
params.pure = true;
|
params.pure = true;
|
||||||
|
} else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
|
||||||
|
if (arg_idx < argc-1) {
|
||||||
|
imatrix_file = argv[++arg_idx];
|
||||||
|
} else {
|
||||||
|
usage(argv[0]);
|
||||||
|
}
|
||||||
|
} else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
|
||||||
|
if (arg_idx < argc-1) {
|
||||||
|
included_weights.push_back(argv[++arg_idx]);
|
||||||
|
} else {
|
||||||
|
usage(argv[0]);
|
||||||
|
}
|
||||||
|
} else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
|
||||||
|
if (arg_idx < argc-1) {
|
||||||
|
excluded_weights.push_back(argv[++arg_idx]);
|
||||||
|
} else {
|
||||||
|
usage(argv[0]);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argc - arg_idx < 2) {
|
if (argc - arg_idx < 2) {
|
||||||
|
printf("%s: bad arguments\n", argv[0]);
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
}
|
}
|
||||||
|
if (!included_weights.empty() && !excluded_weights.empty()) {
|
||||||
|
usage(argv[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unordered_map<std::string, std::vector<float>> imatrix_data;
|
||||||
|
prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
|
||||||
|
if (!imatrix_data.empty()) {
|
||||||
|
params.imatrix = &imatrix_data;
|
||||||
|
}
|
||||||
|
|
||||||
llama_backend_init(false);
|
llama_backend_init(false);
|
||||||
|
|
||||||
|
@ -163,6 +285,13 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
|
||||||
|
fprintf(stderr, "\n===============================================================================================\n");
|
||||||
|
fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
|
||||||
|
fprintf(stderr, "===============================================================================================\n\n\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
print_build_info();
|
print_build_info();
|
||||||
|
|
||||||
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
||||||
|
|
|
@ -1180,8 +1180,9 @@ struct llama_server_context
|
||||||
return slot.images.size() > 0;
|
return slot.images.size() > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void send_error(task_server& task, std::string error)
|
void send_error(task_server& task, const std::string &error)
|
||||||
{
|
{
|
||||||
|
LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
std::unique_lock<std::mutex> lock(mutex_results);
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = task.id;
|
res.id = task.id;
|
||||||
|
@ -1557,6 +1558,7 @@ struct llama_server_context
|
||||||
void process_tasks()
|
void process_tasks()
|
||||||
{
|
{
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
|
std::vector<task_server> deferred_tasks;
|
||||||
while (!queue_tasks.empty())
|
while (!queue_tasks.empty())
|
||||||
{
|
{
|
||||||
task_server task = queue_tasks.front();
|
task_server task = queue_tasks.front();
|
||||||
|
@ -1567,15 +1569,24 @@ struct llama_server_context
|
||||||
llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
|
llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
|
||||||
if (slot == nullptr)
|
if (slot == nullptr)
|
||||||
{
|
{
|
||||||
LOG_TEE("slot unavailable\n");
|
// if no slot is available, we defer this task for processing later
|
||||||
// send error result
|
deferred_tasks.push_back(task);
|
||||||
send_error(task, "slot unavailable");
|
break;
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (task.data.contains("system_prompt"))
|
if (task.data.contains("system_prompt"))
|
||||||
{
|
{
|
||||||
|
if (!all_slots_are_idle) {
|
||||||
|
send_error(task, "system prompt can only be updated when all slots are idle");
|
||||||
|
break;
|
||||||
|
}
|
||||||
process_system_prompt_data(task.data["system_prompt"]);
|
process_system_prompt_data(task.data["system_prompt"]);
|
||||||
|
|
||||||
|
// reset cache_tokens for all slots
|
||||||
|
for (llama_client_slot &slot : slots)
|
||||||
|
{
|
||||||
|
slot.cache_tokens.clear();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slot->reset();
|
slot->reset();
|
||||||
|
@ -1605,6 +1616,12 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// add all the deferred tasks back to the queue
|
||||||
|
for (task_server &task : deferred_tasks)
|
||||||
|
{
|
||||||
|
queue_tasks.push_back(task);
|
||||||
|
}
|
||||||
|
|
||||||
// remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
|
// remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
|
||||||
std::vector<task_result> agg_results;
|
std::vector<task_result> agg_results;
|
||||||
auto queue_iterator = queue_multitasks.begin();
|
auto queue_iterator = queue_multitasks.begin();
|
||||||
|
@ -1652,8 +1669,7 @@ struct llama_server_context
|
||||||
// attend tasks
|
// attend tasks
|
||||||
process_tasks();
|
process_tasks();
|
||||||
|
|
||||||
// update the system prompt wait until all slots are idle state
|
if (system_need_update)
|
||||||
if (system_need_update && all_slots_are_idle)
|
|
||||||
{
|
{
|
||||||
LOG_TEE("updating system prompt\n");
|
LOG_TEE("updating system prompt\n");
|
||||||
update_system_prompt();
|
update_system_prompt();
|
||||||
|
|
|
@ -65,6 +65,10 @@ int main(int argc, char ** argv) {
|
||||||
// load the draft model
|
// load the draft model
|
||||||
params.model = params.model_draft;
|
params.model = params.model_draft;
|
||||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
params.n_gpu_layers = params.n_gpu_layers_draft;
|
||||||
|
if (params.n_threads_draft > 0) {
|
||||||
|
params.n_threads = params.n_threads_draft;
|
||||||
|
}
|
||||||
|
params.n_threads_batch = params.n_threads_batch_draft;
|
||||||
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
|
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
{
|
{
|
||||||
|
|
|
@ -263,7 +263,6 @@ static void init_model(struct my_llama_model * model) {
|
||||||
model->data.resize(size + tensor_alignment);
|
model->data.resize(size + tensor_alignment);
|
||||||
alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
|
alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
|
||||||
alloc_model(alloc, model);
|
alloc_model(alloc, model);
|
||||||
ggml_allocr_free(alloc);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
|
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||||
|
@ -1102,7 +1101,6 @@ int main(int argc, char ** argv) {
|
||||||
alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
|
alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
|
||||||
ggml_allocr_alloc(alloc, tokens_input);
|
ggml_allocr_alloc(alloc, tokens_input);
|
||||||
ggml_allocr_alloc(alloc, target_probs);
|
ggml_allocr_alloc(alloc, target_probs);
|
||||||
ggml_allocr_free(alloc);
|
|
||||||
|
|
||||||
// context for compute tensors without their data
|
// context for compute tensors without their data
|
||||||
const size_t estimated_compute_size_wo_data = (
|
const size_t estimated_compute_size_wo_data = (
|
||||||
|
@ -1149,7 +1147,6 @@ int main(int argc, char ** argv) {
|
||||||
best_compute_size = max_compute_size;
|
best_compute_size = max_compute_size;
|
||||||
best_order = gf->order;
|
best_order = gf->order;
|
||||||
}
|
}
|
||||||
ggml_allocr_free(alloc);
|
|
||||||
ggml_free(ctx_compute);
|
ggml_free(ctx_compute);
|
||||||
}
|
}
|
||||||
size_t max_compute_size = best_compute_size;
|
size_t max_compute_size = best_compute_size;
|
||||||
|
@ -1177,7 +1174,6 @@ int main(int argc, char ** argv) {
|
||||||
params.common.use_flash,
|
params.common.use_flash,
|
||||||
params.common.use_checkpointing
|
params.common.use_checkpointing
|
||||||
);
|
);
|
||||||
ggml_allocr_free(alloc);
|
|
||||||
|
|
||||||
std::vector<llama_token> train_tokens;
|
std::vector<llama_token> train_tokens;
|
||||||
std::vector<size_t> train_samples_begin;
|
std::vector<size_t> train_samples_begin;
|
||||||
|
|
18
flake.lock
generated
|
@ -5,11 +5,11 @@
|
||||||
"nixpkgs-lib": "nixpkgs-lib"
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1701473968,
|
"lastModified": 1704982712,
|
||||||
"narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=",
|
"narHash": "sha256-2Ptt+9h8dczgle2Oo6z5ni5rt/uLMG47UFTR1ry/wgg=",
|
||||||
"owner": "hercules-ci",
|
"owner": "hercules-ci",
|
||||||
"repo": "flake-parts",
|
"repo": "flake-parts",
|
||||||
"rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5",
|
"rev": "07f6395285469419cf9d078f59b5b49993198c00",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -20,11 +20,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1703637592,
|
"lastModified": 1705133751,
|
||||||
"narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
|
"narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
|
"rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -37,11 +37,11 @@
|
||||||
"nixpkgs-lib": {
|
"nixpkgs-lib": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"dir": "lib",
|
"dir": "lib",
|
||||||
"lastModified": 1701253981,
|
"lastModified": 1703961334,
|
||||||
"narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=",
|
"narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58",
|
"rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
57
flake.nix
|
@ -6,28 +6,41 @@
|
||||||
flake-parts.url = "github:hercules-ci/flake-parts";
|
flake-parts.url = "github:hercules-ci/flake-parts";
|
||||||
};
|
};
|
||||||
|
|
||||||
# Optional binary cache
|
# There's an optional binary cache available. The details are below, but they're commented out.
|
||||||
nixConfig = {
|
#
|
||||||
extra-substituters = [
|
# Why? The terrible experience of being prompted to accept them on every single Nix command run.
|
||||||
# Populated by the CI in ggerganov/llama.cpp
|
# Plus, there are warnings shown about not being a trusted user on a default Nix install
|
||||||
"https://llama-cpp.cachix.org"
|
# if you *do* say yes to the prompts.
|
||||||
|
#
|
||||||
# A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
# This experience makes having `nixConfig` in a flake a persistent UX problem.
|
||||||
# Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
#
|
||||||
# This lets one skip building e.g. the CUDA-enabled openmpi.
|
# To make use of the binary cache, please add the relevant settings to your `nix.conf`.
|
||||||
# TODO: Replace once nix-community obtains an official one.
|
# It's located at `/etc/nix/nix.conf` on non-NixOS systems. On NixOS, adjust the `nix.settings`
|
||||||
"https://cuda-maintainers.cachix.org"
|
# option in your NixOS configuration to add `extra-substituters` and `extra-trusted-public-keys`,
|
||||||
];
|
# as shown below.
|
||||||
|
#
|
||||||
# Verify these are the same keys as published on
|
# ```
|
||||||
# - https://app.cachix.org/cache/llama-cpp
|
# nixConfig = {
|
||||||
# - https://app.cachix.org/cache/cuda-maintainers
|
# extra-substituters = [
|
||||||
extra-trusted-public-keys = [
|
# # Populated by the CI in ggerganov/llama.cpp
|
||||||
"llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
|
# "https://llama-cpp.cachix.org"
|
||||||
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
#
|
||||||
];
|
# # A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
||||||
};
|
# # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
||||||
|
# # This lets one skip building e.g. the CUDA-enabled openmpi.
|
||||||
|
# # TODO: Replace once nix-community obtains an official one.
|
||||||
|
# "https://cuda-maintainers.cachix.org"
|
||||||
|
# ];
|
||||||
|
#
|
||||||
|
# # Verify these are the same keys as published on
|
||||||
|
# # - https://app.cachix.org/cache/llama-cpp
|
||||||
|
# # - https://app.cachix.org/cache/cuda-maintainers
|
||||||
|
# extra-trusted-public-keys = [
|
||||||
|
# "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
|
||||||
|
# "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
||||||
|
# ];
|
||||||
|
# };
|
||||||
|
# ```
|
||||||
|
|
||||||
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
|
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
|
||||||
#
|
#
|
||||||
|
|
|
@ -16,14 +16,14 @@ extern "C" {
|
||||||
typedef void * ggml_backend_buffer_type_context_t;
|
typedef void * ggml_backend_buffer_type_context_t;
|
||||||
|
|
||||||
struct ggml_backend_buffer_type_i {
|
struct ggml_backend_buffer_type_i {
|
||||||
const char * (*get_name) (ggml_backend_buffer_type_t buft);
|
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
||||||
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||||
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
||||||
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||||
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
||||||
// check if tensor data is in host memory
|
// check if tensor data is in host memory
|
||||||
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
||||||
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_buffer_type {
|
struct ggml_backend_buffer_type {
|
||||||
|
@ -35,15 +35,15 @@ extern "C" {
|
||||||
typedef void * ggml_backend_buffer_context_t;
|
typedef void * ggml_backend_buffer_context_t;
|
||||||
|
|
||||||
struct ggml_backend_buffer_i {
|
struct ggml_backend_buffer_i {
|
||||||
const char * (*get_name) (ggml_backend_buffer_t buffer);
|
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
|
||||||
void (*free_buffer)(ggml_backend_buffer_t buffer);
|
void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
|
||||||
void * (*get_base) (ggml_backend_buffer_t buffer);
|
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
|
||||||
void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
|
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
|
||||||
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||||
void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_buffer {
|
struct ggml_backend_buffer {
|
||||||
|
@ -54,7 +54,7 @@ extern "C" {
|
||||||
enum ggml_backend_buffer_usage usage;
|
enum ggml_backend_buffer_usage usage;
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||||
ggml_backend_buffer_type_t buft,
|
ggml_backend_buffer_type_t buft,
|
||||||
struct ggml_backend_buffer_i iface,
|
struct ggml_backend_buffer_i iface,
|
||||||
ggml_backend_buffer_context_t context,
|
ggml_backend_buffer_context_t context,
|
||||||
|
@ -70,12 +70,12 @@ extern "C" {
|
||||||
typedef void * ggml_backend_context_t;
|
typedef void * ggml_backend_context_t;
|
||||||
|
|
||||||
struct ggml_backend_i {
|
struct ggml_backend_i {
|
||||||
const char * (*get_name)(ggml_backend_t backend);
|
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
|
||||||
|
|
||||||
void (*free)(ggml_backend_t backend);
|
void (*GGML_CALL free)(ggml_backend_t backend);
|
||||||
|
|
||||||
// buffer allocation
|
// buffer allocation
|
||||||
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
|
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
|
||||||
|
|
||||||
// (optional) asynchronous tensor data access
|
// (optional) asynchronous tensor data access
|
||||||
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
|
@ -83,18 +83,18 @@ extern "C" {
|
||||||
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
|
||||||
// (optional) complete all pending operations
|
// (optional) complete all pending operations
|
||||||
void (*synchronize)(ggml_backend_t backend);
|
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
||||||
|
|
||||||
// compute graph with a plan
|
// compute graph with a plan
|
||||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||||
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
|
|
||||||
// compute graph without a plan (async)
|
// compute graph without a plan (async)
|
||||||
bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
|
|
||||||
// check if the backend supports an operation
|
// check if the backend supports an operation
|
||||||
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend {
|
struct ggml_backend {
|
||||||
|
@ -107,9 +107,9 @@ extern "C" {
|
||||||
// Backend registry
|
// Backend registry
|
||||||
//
|
//
|
||||||
|
|
||||||
typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
|
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
|
||||||
|
|
||||||
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
|
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
151
ggml-backend.c
|
@ -19,7 +19,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
||||||
return buft->iface.get_name(buft);
|
return buft->iface.get_name(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
return buft->iface.alloc_buffer(buft, size);
|
return buft->iface.alloc_buffer(buft, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
return buft->iface.get_alignment(buft);
|
return buft->iface.get_alignment(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||||
if (buft->iface.get_alloc_size) {
|
if (buft->iface.get_alloc_size) {
|
||||||
return buft->iface.get_alloc_size(buft, tensor);
|
return buft->iface.get_alloc_size(buft, tensor);
|
||||||
|
@ -48,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
|
||||||
// backend buffer
|
// backend buffer
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||||
ggml_backend_buffer_type_t buft,
|
ggml_backend_buffer_type_t buft,
|
||||||
struct ggml_backend_buffer_i iface,
|
struct ggml_backend_buffer_i iface,
|
||||||
ggml_backend_buffer_context_t context,
|
ggml_backend_buffer_context_t context,
|
||||||
|
@ -95,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
return base;
|
return base;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||||
// init_tensor is optional
|
// init_tensor is optional
|
||||||
if (buffer->iface.init_tensor) {
|
if (buffer->iface.init_tensor) {
|
||||||
buffer->iface.init_tensor(buffer, tensor);
|
buffer->iface.init_tensor(buffer, tensor);
|
||||||
|
@ -191,7 +191,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
|
|
||||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||||
|
@ -201,7 +201,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
|
||||||
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
|
|
||||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||||
|
@ -318,9 +318,9 @@ struct ggml_backend_reg {
|
||||||
static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
|
static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
|
||||||
static size_t ggml_backend_registry_count = 0;
|
static size_t ggml_backend_registry_count = 0;
|
||||||
|
|
||||||
static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
||||||
|
|
||||||
static void ggml_backend_registry_init(void) {
|
GGML_CALL static void ggml_backend_registry_init(void) {
|
||||||
static bool initialized = false;
|
static bool initialized = false;
|
||||||
|
|
||||||
if (initialized) {
|
if (initialized) {
|
||||||
|
@ -333,18 +333,18 @@ static void ggml_backend_registry_init(void) {
|
||||||
|
|
||||||
// add forward decls here to avoid including the backend headers
|
// add forward decls here to avoid including the backend headers
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
extern void ggml_backend_cuda_reg_devices(void);
|
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
|
||||||
ggml_backend_cuda_reg_devices();
|
ggml_backend_cuda_reg_devices();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
||||||
extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||||
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
||||||
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
|
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
|
||||||
|
|
||||||
size_t id = ggml_backend_registry_count;
|
size_t id = ggml_backend_registry_count;
|
||||||
|
@ -439,33 +439,33 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
|
||||||
|
|
||||||
// backend CPU
|
// backend CPU
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
||||||
return "CPU";
|
return "CPU";
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
return (void *)buffer->context;
|
return (void *)buffer->context;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
free(buffer->context);
|
free(buffer->context);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
memcpy((char *)tensor->data + offset, data, size);
|
memcpy((char *)tensor->data + offset, data, size);
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
memcpy(data, (const char *)tensor->data + offset, size);
|
memcpy(data, (const char *)tensor->data + offset, size);
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||||
if (ggml_backend_buffer_is_host(src->buffer)) {
|
if (ggml_backend_buffer_is_host(src->buffer)) {
|
||||||
memcpy(dst->data, src->data, ggml_nbytes(src));
|
memcpy(dst->data, src->data, ggml_nbytes(src));
|
||||||
return true;
|
return true;
|
||||||
|
@ -475,7 +475,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
memset(buffer->context, value, buffer->size);
|
memset(buffer->context, value, buffer->size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -506,13 +506,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
||||||
|
|
||||||
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
return "CPU";
|
return "CPU";
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
||||||
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
||||||
|
|
||||||
|
@ -521,25 +521,25 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
|
||||||
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
|
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
return TENSOR_ALIGNMENT;
|
return TENSOR_ALIGNMENT;
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||||
return ggml_backend_is_cpu(backend);
|
return ggml_backend_is_cpu(backend);
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
||||||
/* .iface = */ {
|
/* .iface = */ {
|
||||||
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
||||||
|
@ -561,23 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||||
|
|
||||||
#include <hbwmalloc.h>
|
#include <hbwmalloc.h>
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
return "CPU_HBM";
|
return "CPU_HBM";
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
||||||
return "CPU_HBM";
|
return "CPU_HBM";
|
||||||
|
|
||||||
GGML_UNUSED(buf);
|
GGML_UNUSED(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
hbw_free(buffer->context);
|
hbw_free(buffer->context);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
//void * ptr = hbw_malloc(size);
|
//void * ptr = hbw_malloc(size);
|
||||||
void * ptr;
|
void * ptr;
|
||||||
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
||||||
|
@ -617,20 +617,20 @@ struct ggml_backend_cpu_context {
|
||||||
size_t work_size;
|
size_t work_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
||||||
return "CPU";
|
return "CPU";
|
||||||
|
|
||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
||||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||||
free(cpu_ctx->work_data);
|
free(cpu_ctx->work_data);
|
||||||
free(cpu_ctx);
|
free(cpu_ctx);
|
||||||
free(backend);
|
free(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
return ggml_backend_cpu_buffer_type();
|
return ggml_backend_cpu_buffer_type();
|
||||||
|
|
||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
|
@ -641,7 +641,7 @@ struct ggml_backend_plan_cpu {
|
||||||
struct ggml_cgraph cgraph;
|
struct ggml_cgraph cgraph;
|
||||||
};
|
};
|
||||||
|
|
||||||
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
||||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||||
|
|
||||||
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
||||||
|
@ -656,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
|
||||||
return cpu_plan;
|
return cpu_plan;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||||
|
|
||||||
free(cpu_plan->cplan.work_data);
|
free(cpu_plan->cplan.work_data);
|
||||||
|
@ -665,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
|
||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||||
|
|
||||||
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
||||||
|
@ -673,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
|
||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||||
|
|
||||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||||
|
@ -690,8 +690,10 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
|
case GGML_OP_CPY:
|
||||||
|
return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
|
||||||
case GGML_OP_MUL_MAT:
|
case GGML_OP_MUL_MAT:
|
||||||
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
||||||
default:
|
default:
|
||||||
|
@ -732,7 +734,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
||||||
return cpu_backend;
|
return cpu_backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
||||||
return backend && backend->iface.get_name == ggml_backend_cpu_name;
|
return backend && backend->iface.get_name == ggml_backend_cpu_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -743,11 +745,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
||||||
ctx->n_threads = n_threads;
|
ctx->n_threads = n_threads;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
||||||
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
|
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
|
||||||
return ggml_backend_cpu_init();
|
return ggml_backend_cpu_init();
|
||||||
|
|
||||||
GGML_UNUSED(params);
|
GGML_UNUSED(params);
|
||||||
|
@ -802,6 +804,9 @@ struct ggml_backend_sched {
|
||||||
__attribute__((aligned(GGML_MEM_ALIGN)))
|
__attribute__((aligned(GGML_MEM_ALIGN)))
|
||||||
#endif
|
#endif
|
||||||
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
|
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
|
||||||
|
|
||||||
|
ggml_backend_sched_eval_callback callback_eval;
|
||||||
|
void * callback_eval_user_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
|
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
|
||||||
|
@ -1186,6 +1191,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
ggml_tallocr_t src_allocr = node_allocr(src);
|
ggml_tallocr_t src_allocr = node_allocr(src);
|
||||||
GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
|
GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
|
||||||
if (src_allocr != node_allocr) {
|
if (src_allocr != node_allocr) {
|
||||||
|
// create a copy of the input in the split's backend
|
||||||
|
size_t id = hash_id(src);
|
||||||
|
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
||||||
|
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
|
||||||
|
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
||||||
|
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
||||||
|
|
||||||
|
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
||||||
|
node_allocr(tensor_copy) = cur_allocr;
|
||||||
|
SET_CAUSE(tensor_copy, "4.cpy");
|
||||||
|
|
||||||
|
int n_inputs = sched->splits[cur_split].n_inputs++;
|
||||||
|
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
||||||
|
sched->splits[cur_split].inputs[n_inputs] = src;
|
||||||
|
}
|
||||||
|
node->src[j] = sched->node_copies[id][cur_backend_id];
|
||||||
|
|
||||||
|
#if 0
|
||||||
// check if the input is already in the split
|
// check if the input is already in the split
|
||||||
bool found = false;
|
bool found = false;
|
||||||
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
|
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
|
||||||
|
@ -1201,19 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
||||||
sched->splits[cur_split].inputs[n_inputs] = src;
|
sched->splits[cur_split].inputs[n_inputs] = src;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
// create a copy of the input in the split's backend
|
|
||||||
size_t id = hash_id(src);
|
|
||||||
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
|
||||||
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
|
|
||||||
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
|
||||||
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
|
||||||
|
|
||||||
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
|
||||||
node_allocr(tensor_copy) = cur_allocr;
|
|
||||||
SET_CAUSE(tensor_copy, "4.cpy");
|
|
||||||
}
|
|
||||||
node->src[j] = sched->node_copies[id][cur_backend_id];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1325,9 +1336,38 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
||||||
ggml_graph_dump_dot(split->graph, NULL, split_filename);
|
ggml_graph_dump_dot(split->graph, NULL, split_filename);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
uint64_t compute_start_us = ggml_time_us();
|
uint64_t compute_start_us = ggml_time_us();
|
||||||
|
if (!sched->callback_eval) {
|
||||||
ggml_backend_graph_compute(split_backend, &split->graph);
|
ggml_backend_graph_compute(split_backend, &split->graph);
|
||||||
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
||||||
|
} else {
|
||||||
|
// similar to ggml_backend_compare_graph_backend
|
||||||
|
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
|
||||||
|
struct ggml_tensor * t = split->graph.nodes[j0];
|
||||||
|
|
||||||
|
// check if the user needs data from this node
|
||||||
|
bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
||||||
|
|
||||||
|
int j1 = j0;
|
||||||
|
|
||||||
|
// determine the range [j0, j1] of nodes that can be computed together
|
||||||
|
while (!need && j1 < split->graph.n_nodes - 1) {
|
||||||
|
t = split->graph.nodes[++j1];
|
||||||
|
need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
||||||
|
|
||||||
|
ggml_backend_graph_compute(split_backend, &gv);
|
||||||
|
|
||||||
|
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
j0 = j1;
|
||||||
|
}
|
||||||
|
}
|
||||||
uint64_t compute_end_us = ggml_time_us();
|
uint64_t compute_end_us = ggml_time_us();
|
||||||
compute_us[split_backend_id] += compute_end_us - compute_start_us;
|
compute_us[split_backend_id] += compute_end_us - compute_start_us;
|
||||||
}
|
}
|
||||||
|
@ -1438,6 +1478,11 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
||||||
|
sched->callback_eval = callback;
|
||||||
|
sched->callback_eval_user_data = user_data;
|
||||||
|
}
|
||||||
|
|
||||||
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
||||||
return sched->n_splits;
|
return sched->n_splits;
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,9 +18,9 @@ extern "C" {
|
||||||
|
|
||||||
// buffer type
|
// buffer type
|
||||||
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ extern "C" {
|
||||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||||
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||||
|
@ -58,8 +58,8 @@ extern "C" {
|
||||||
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
|
|
||||||
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
|
|
||||||
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
|
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
|
||||||
|
|
||||||
|
@ -80,13 +80,13 @@ extern "C" {
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
|
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||||
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
|
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
|
||||||
|
|
||||||
// Create a backend buffer from an existing pointer
|
// Create a backend buffer from an existing pointer
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
#ifdef GGML_USE_CPU_HBM
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||||
|
@ -148,6 +148,14 @@ extern "C" {
|
||||||
struct ggml_backend_sched;
|
struct ggml_backend_sched;
|
||||||
typedef struct ggml_backend_sched * ggml_backend_sched_t;
|
typedef struct ggml_backend_sched * ggml_backend_sched_t;
|
||||||
|
|
||||||
|
// when ask == true, the scheduler wants to know if the user wants to observe this node
|
||||||
|
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
|
||||||
|
//
|
||||||
|
// when ask == false, the scheduler is passing the node tensor to the user for observation
|
||||||
|
// if the user returns false, the scheduler will cancel the graph compute
|
||||||
|
//
|
||||||
|
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
||||||
|
|
||||||
// Initialize a backend scheduler
|
// Initialize a backend scheduler
|
||||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
||||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||||
|
@ -171,6 +179,9 @@ extern "C" {
|
||||||
// Synchronize all backends
|
// Synchronize all backends
|
||||||
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
|
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
|
||||||
|
|
||||||
|
// Set a callback to be called for each resulting node during graph compute
|
||||||
|
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Utils
|
// Utils
|
||||||
//
|
//
|
||||||
|
@ -186,7 +197,7 @@ extern "C" {
|
||||||
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
|
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
|
||||||
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
|
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
|
||||||
|
|
||||||
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
|
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
|
||||||
|
|
||||||
// Compare the output of two backends
|
// Compare the output of two backends
|
||||||
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
|
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
|
||||||
|
|
223
ggml-cuda.cu
|
@ -12,9 +12,6 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <array>
|
#include <array>
|
||||||
#include "ggml-cuda.h"
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-backend-impl.h"
|
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIPBLAS)
|
||||||
#include <hip/hip_runtime.h>
|
#include <hip/hip_runtime.h>
|
||||||
|
@ -118,6 +115,11 @@
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
#endif // defined(GGML_USE_HIPBLAS)
|
||||||
|
|
||||||
|
// ggml-cuda need half type so keep ggml headers include at last
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "ggml-backend-impl.h"
|
||||||
|
|
||||||
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
||||||
|
|
||||||
#define CC_PASCAL 600
|
#define CC_PASCAL 600
|
||||||
|
@ -1105,6 +1107,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
|
||||||
#endif // GGML_CUDA_F16
|
#endif // GGML_CUDA_F16
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename dst_t>
|
||||||
|
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
||||||
|
|
||||||
|
const int i = blockIdx.x;
|
||||||
|
|
||||||
|
// assume 32 threads
|
||||||
|
const int tid = threadIdx.x;
|
||||||
|
const int il = tid/8;
|
||||||
|
const int ir = tid%8;
|
||||||
|
const int ib = 8*i + ir;
|
||||||
|
if (ib >= nb32) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
||||||
|
|
||||||
|
const block_q4_0 * x = (const block_q4_0 *)vx + ib;
|
||||||
|
const float d = __half2float(x->d);
|
||||||
|
const float dm = -8*d;
|
||||||
|
|
||||||
|
const uint8_t * q = x->qs + 4*il;
|
||||||
|
|
||||||
|
for (int l = 0; l < 4; ++l) {
|
||||||
|
y[l+ 0] = d * (q[l] & 0xF) + dm;
|
||||||
|
y[l+16] = d * (q[l] >> 4) + dm;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename dst_t>
|
||||||
|
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
||||||
|
|
||||||
|
const int i = blockIdx.x;
|
||||||
|
|
||||||
|
// assume 32 threads
|
||||||
|
const int tid = threadIdx.x;
|
||||||
|
const int il = tid/8;
|
||||||
|
const int ir = tid%8;
|
||||||
|
const int ib = 8*i + ir;
|
||||||
|
if (ib >= nb32) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
||||||
|
|
||||||
|
const block_q4_1 * x = (const block_q4_1 *)vx + ib;
|
||||||
|
const float2 d = __half22float2(x->dm);
|
||||||
|
|
||||||
|
const uint8_t * q = x->qs + 4*il;
|
||||||
|
|
||||||
|
for (int l = 0; l < 4; ++l) {
|
||||||
|
y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
|
||||||
|
y[l+16] = d.x * (q[l] >> 4) + d.y;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//================================== k-quants
|
//================================== k-quants
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -5076,10 +5133,10 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
||||||
const block_q_t * x = (const block_q_t *) vx;
|
const block_q_t * x = (const block_q_t *) vx;
|
||||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||||
|
|
||||||
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
|
for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
|
||||||
const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
|
const int ibx = row*blocks_per_row + i; // x block index
|
||||||
|
|
||||||
const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
|
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
||||||
|
|
||||||
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
|
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
|
||||||
|
|
||||||
|
@ -6253,6 +6310,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename dst_t>
|
||||||
|
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||||
|
const int nb32 = k / 32;
|
||||||
|
const int nb = (k + 255) / 256;
|
||||||
|
dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename dst_t>
|
||||||
|
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||||
|
const int nb32 = k / 32;
|
||||||
|
const int nb = (k + 255) / 256;
|
||||||
|
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
@ -6301,9 +6372,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||||
int id;
|
int id;
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
return dequantize_row_q4_0_cuda;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
return dequantize_row_q4_1_cuda;
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
|
@ -6338,9 +6409,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||||
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
return dequantize_row_q4_0_cuda;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
return dequantize_row_q4_1_cuda;
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
|
@ -7546,11 +7617,11 @@ struct cuda_pool_alloc {
|
||||||
|
|
||||||
static bool g_cublas_loaded = false;
|
static bool g_cublas_loaded = false;
|
||||||
|
|
||||||
bool ggml_cublas_loaded(void) {
|
GGML_CALL bool ggml_cublas_loaded(void) {
|
||||||
return g_cublas_loaded;
|
return g_cublas_loaded;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_init_cublas() {
|
GGML_CALL void ggml_init_cublas() {
|
||||||
static bool initialized = false;
|
static bool initialized = false;
|
||||||
|
|
||||||
if (!initialized) {
|
if (!initialized) {
|
||||||
|
@ -7638,7 +7709,7 @@ void ggml_init_cublas() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void * ggml_cuda_host_malloc(size_t size) {
|
GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
|
||||||
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
@ -7656,7 +7727,7 @@ void * ggml_cuda_host_malloc(size_t size) {
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_cuda_host_free(void * ptr) {
|
GGML_CALL void ggml_cuda_host_free(void * ptr) {
|
||||||
CUDA_CHECK(cudaFreeHost(ptr));
|
CUDA_CHECK(cudaFreeHost(ptr));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9173,7 +9244,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
|
||||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||||
if (!g_cublas_loaded) return false;
|
if (!g_cublas_loaded) return false;
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
const int64_t ne10 = src1->ne[0];
|
||||||
|
@ -9944,7 +10015,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
|
||||||
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_set_main_device(const int main_device) {
|
GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
|
||||||
if (main_device >= g_device_count) {
|
if (main_device >= g_device_count) {
|
||||||
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
||||||
main_device, g_device_count, g_main_device);
|
main_device, g_device_count, g_main_device);
|
||||||
|
@ -9959,7 +10030,7 @@ static void ggml_cuda_set_main_device(const int main_device) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||||
if (!g_cublas_loaded) return false;
|
if (!g_cublas_loaded) return false;
|
||||||
|
|
||||||
ggml_cuda_func_t func;
|
ggml_cuda_func_t func;
|
||||||
|
@ -10117,7 +10188,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_cuda_get_device_count() {
|
GGML_CALL int ggml_cuda_get_device_count() {
|
||||||
int device_count;
|
int device_count;
|
||||||
if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
|
if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -10125,7 +10196,7 @@ int ggml_cuda_get_device_count() {
|
||||||
return device_count;
|
return device_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
|
GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
||||||
snprintf(description, description_size, "%s", prop.name);
|
snprintf(description, description_size, "%s", prop.name);
|
||||||
|
@ -10175,27 +10246,27 @@ struct ggml_backend_cuda_buffer_context {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
return ctx->name.c_str();
|
return ctx->name.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||||
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
||||||
delete ctx;
|
delete ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
return ctx->dev_ptr;
|
return ctx->dev_ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
|
|
||||||
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
||||||
|
@ -10227,7 +10298,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
|
@ -10238,7 +10309,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
|
@ -10249,7 +10320,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||||
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
||||||
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
||||||
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
|
@ -10266,7 +10337,7 @@ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, co
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx->device);
|
ggml_cuda_set_device(ctx->device);
|
||||||
|
@ -10288,19 +10359,18 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
||||||
};
|
};
|
||||||
|
|
||||||
// cuda buffer type
|
// cuda buffer type
|
||||||
|
|
||||||
struct ggml_backend_cuda_buffer_type_context {
|
struct ggml_backend_cuda_buffer_type_context {
|
||||||
int device;
|
int device;
|
||||||
std::string name;
|
std::string name;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||||
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||||
|
|
||||||
return ctx->name.c_str();
|
return ctx->name.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||||
|
|
||||||
ggml_cuda_set_device(buft_ctx->device);
|
ggml_cuda_set_device(buft_ctx->device);
|
||||||
|
@ -10319,13 +10389,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||||
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
return 128;
|
return 128;
|
||||||
|
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||||
int64_t row_low = 0;
|
int64_t row_low = 0;
|
||||||
int64_t row_high = ggml_nrows(tensor);
|
int64_t row_high = ggml_nrows(tensor);
|
||||||
int64_t nrows_split = row_high - row_low;
|
int64_t nrows_split = row_high - row_low;
|
||||||
|
@ -10345,7 +10415,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||||
if (!ggml_backend_is_cuda(backend)) {
|
if (!ggml_backend_is_cuda(backend)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -10365,7 +10435,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||||
/* .is_host = */ NULL,
|
/* .is_host = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||||
// FIXME: this is not thread safe
|
// FIXME: this is not thread safe
|
||||||
if (device >= ggml_backend_cuda_get_device_count()) {
|
if (device >= ggml_backend_cuda_get_device_count()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
@ -10410,7 +10480,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||||
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||||
return GGML_CUDA_NAME "_Split";
|
return GGML_CUDA_NAME "_Split";
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
|
@ -10421,19 +10491,19 @@ static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_
|
||||||
// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
||||||
//}
|
//}
|
||||||
|
|
||||||
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||||
delete ctx;
|
delete ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
||||||
return (void *)0x1000;
|
return (void *)0x1000;
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||||
|
|
||||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||||
|
@ -10483,7 +10553,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
|
||||||
tensor->extra = extra;
|
tensor->extra = extra;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
// split tensors must always be set in their entirety at once
|
// split tensors must always be set in their entirety at once
|
||||||
GGML_ASSERT(offset == 0);
|
GGML_ASSERT(offset == 0);
|
||||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||||
|
@ -10517,7 +10587,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
// split tensors must always be set in their entirety at once
|
// split tensors must always be set in their entirety at once
|
||||||
GGML_ASSERT(offset == 0);
|
GGML_ASSERT(offset == 0);
|
||||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||||
|
@ -10551,7 +10621,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
UNUSED(value);
|
UNUSED(value);
|
||||||
}
|
}
|
||||||
|
@ -10570,13 +10640,13 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
||||||
|
|
||||||
// cuda split buffer type
|
// cuda split buffer type
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||||
return GGML_CUDA_NAME "_Split";
|
return GGML_CUDA_NAME "_Split";
|
||||||
|
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
||||||
// instead, we allocate them for each tensor separately in init_tensor
|
// instead, we allocate them for each tensor separately in init_tensor
|
||||||
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
||||||
|
@ -10586,13 +10656,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
|
||||||
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
return 128;
|
return 128;
|
||||||
|
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||||
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
||||||
|
|
||||||
size_t total_size = 0;
|
size_t total_size = 0;
|
||||||
|
@ -10619,13 +10689,13 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_bu
|
||||||
return total_size;
|
return total_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||||
return ggml_backend_is_cuda(backend);
|
return ggml_backend_is_cuda(backend);
|
||||||
|
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
|
@ -10640,7 +10710,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
|
||||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
||||||
// FIXME: this is not thread safe
|
// FIXME: this is not thread safe
|
||||||
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
||||||
|
|
||||||
|
@ -10676,23 +10746,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten
|
||||||
|
|
||||||
// host buffer type
|
// host buffer type
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||||
return GGML_CUDA_NAME "_Host";
|
return GGML_CUDA_NAME "_Host";
|
||||||
|
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||||
return GGML_CUDA_NAME "_Host";
|
return GGML_CUDA_NAME "_Host";
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_cuda_host_free(buffer->context);
|
ggml_cuda_host_free(buffer->context);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
void * ptr = ggml_cuda_host_malloc(size);
|
void * ptr = ggml_cuda_host_malloc(size);
|
||||||
|
|
||||||
if (ptr == nullptr) {
|
if (ptr == nullptr) {
|
||||||
|
@ -10708,7 +10778,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
||||||
return buffer;
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
||||||
/* .iface = */ {
|
/* .iface = */ {
|
||||||
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
||||||
|
@ -10730,26 +10800,26 @@ static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
|
||||||
|
|
||||||
// backend
|
// backend
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
return cuda_ctx->name.c_str();
|
return cuda_ctx->name.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
delete cuda_ctx;
|
delete cuda_ctx;
|
||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
|
|
||||||
|
@ -10759,7 +10829,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
|
||||||
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
|
|
||||||
|
@ -10769,11 +10839,8 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
|
||||||
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
||||||
if (!ggml_backend_is_cuda(backend_src) && !ggml_backend_is_cuda(backend_dst)) {
|
GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
|
||||||
printf("not cuda either %s -> %s\n", src->name, dst->name);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// host -> device
|
// host -> device
|
||||||
if (ggml_backend_buffer_is_cuda_host(src->buffer) && ggml_backend_buffer_is_cuda(dst->buffer)) {
|
if (ggml_backend_buffer_is_cuda_host(src->buffer) && ggml_backend_buffer_is_cuda(dst->buffer)) {
|
||||||
|
@ -10827,7 +10894,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
|
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
|
||||||
|
@ -10835,7 +10902,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||||
UNUSED(backend);
|
UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
ggml_cuda_set_main_device(cuda_ctx->device);
|
ggml_cuda_set_main_device(cuda_ctx->device);
|
||||||
|
@ -10874,7 +10941,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(op)) {
|
switch (ggml_get_unary_op(op)) {
|
||||||
|
@ -10903,6 +10970,12 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
|
||||||
if (a->ne[3] != b->ne[3]) {
|
if (a->ne[3] != b->ne[3]) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
ggml_type a_type = a->type;
|
||||||
|
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
|
||||||
|
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_GET_ROWS:
|
case GGML_OP_GET_ROWS:
|
||||||
|
@ -11000,7 +11073,7 @@ static ggml_backend_i ggml_backend_cuda_interface = {
|
||||||
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_cuda_init(int device) {
|
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||||
ggml_init_cublas(); // TODO: remove from ggml.c
|
ggml_init_cublas(); // TODO: remove from ggml.c
|
||||||
|
|
||||||
if (device < 0 || device >= ggml_cuda_get_device_count()) {
|
if (device < 0 || device >= ggml_cuda_get_device_count()) {
|
||||||
|
@ -11024,35 +11097,35 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||||
return cuda_backend;
|
return cuda_backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
||||||
return backend && backend->iface.get_name == ggml_backend_cuda_name;
|
return backend && backend->iface.get_name == ggml_backend_cuda_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_backend_cuda_get_device_count() {
|
GGML_CALL int ggml_backend_cuda_get_device_count() {
|
||||||
return ggml_cuda_get_device_count();
|
return ggml_cuda_get_device_count();
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
||||||
ggml_cuda_get_device_description(device, description, description_size);
|
ggml_cuda_get_device_description(device, description, description_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
||||||
ggml_cuda_set_device(device);
|
ggml_cuda_set_device(device);
|
||||||
|
|
||||||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||||
}
|
}
|
||||||
|
|
||||||
// backend registry
|
// backend registry
|
||||||
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
||||||
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
||||||
return cuda_backend;
|
return cuda_backend;
|
||||||
|
|
||||||
UNUSED(params);
|
UNUSED(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" int ggml_backend_cuda_reg_devices();
|
extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
|
||||||
|
|
||||||
int ggml_backend_cuda_reg_devices() {
|
GGML_CALL int ggml_backend_cuda_reg_devices() {
|
||||||
int device_count = ggml_cuda_get_device_count();
|
int device_count = ggml_cuda_get_device_count();
|
||||||
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
|
||||||
for (int i = 0; i < device_count; i++) {
|
for (int i = 0; i < device_count; i++) {
|
||||||
|
|
32
ggml-cuda.h
|
@ -18,34 +18,34 @@ extern "C" {
|
||||||
#define GGML_CUDA_MAX_DEVICES 16
|
#define GGML_CUDA_MAX_DEVICES 16
|
||||||
|
|
||||||
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
|
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
|
||||||
GGML_API void ggml_init_cublas(void);
|
GGML_API GGML_CALL void ggml_init_cublas(void);
|
||||||
|
|
||||||
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
|
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
|
||||||
GGML_API bool ggml_cublas_loaded(void);
|
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
|
||||||
|
|
||||||
GGML_API void * ggml_cuda_host_malloc(size_t size);
|
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
|
||||||
GGML_API void ggml_cuda_host_free(void * ptr);
|
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
|
||||||
|
|
||||||
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||||
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API int ggml_cuda_get_device_count(void);
|
GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
|
||||||
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||||
// split tensor buffer that splits matrices by rows across multiple devices
|
// split tensor buffer that splits matrices by rows across multiple devices
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||||
|
|
||||||
GGML_API int ggml_backend_cuda_get_device_count(void);
|
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
|
||||||
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||||
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
60
ggml-metal.h
|
@ -27,7 +27,6 @@
|
||||||
|
|
||||||
// max memory buffers that can be mapped to the device
|
// max memory buffers that can be mapped to the device
|
||||||
#define GGML_METAL_MAX_BUFFERS 64
|
#define GGML_METAL_MAX_BUFFERS 64
|
||||||
#define GGML_METAL_MAX_COMMAND_BUFFERS 32
|
|
||||||
|
|
||||||
struct ggml_tensor;
|
struct ggml_tensor;
|
||||||
struct ggml_cgraph;
|
struct ggml_cgraph;
|
||||||
|
@ -36,73 +35,22 @@ struct ggml_cgraph;
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
//
|
|
||||||
// internal API
|
|
||||||
// temporary exposed to user-code
|
|
||||||
//
|
|
||||||
|
|
||||||
struct ggml_metal_context;
|
|
||||||
|
|
||||||
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
|
||||||
|
|
||||||
// number of command buffers to use
|
|
||||||
struct ggml_metal_context * ggml_metal_init(int n_cb);
|
|
||||||
void ggml_metal_free(struct ggml_metal_context * ctx);
|
|
||||||
|
|
||||||
void * ggml_metal_host_malloc(size_t n);
|
|
||||||
void ggml_metal_host_free (void * data);
|
|
||||||
|
|
||||||
// set the number of command buffers to use
|
|
||||||
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
|
|
||||||
|
|
||||||
// creates a mapping between a host memory buffer and a device memory buffer
|
|
||||||
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
|
|
||||||
// - the mapping is used during computation to determine the arguments of the compute kernels
|
|
||||||
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
|
|
||||||
// - max_size specifies the maximum size of a tensor and is used to create shared views such
|
|
||||||
// that it is guaranteed that the tensor will fit in at least one of the views
|
|
||||||
//
|
|
||||||
bool ggml_metal_add_buffer(
|
|
||||||
struct ggml_metal_context * ctx,
|
|
||||||
const char * name,
|
|
||||||
void * data,
|
|
||||||
size_t size,
|
|
||||||
size_t max_size);
|
|
||||||
|
|
||||||
// set data from host memory into the device
|
|
||||||
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
|
||||||
|
|
||||||
// get data from the device into host memory
|
|
||||||
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
|
||||||
|
|
||||||
// try to find operations that can be run concurrently in the graph
|
|
||||||
// you should run it again if the topology of your graph changes
|
|
||||||
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
|
|
||||||
|
|
||||||
// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
|
|
||||||
int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
|
|
||||||
|
|
||||||
// output the concur_list for ggml_alloc
|
|
||||||
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
|
|
||||||
|
|
||||||
// same as ggml_graph_compute but uses Metal
|
|
||||||
// creates gf->n_threads command buffers in parallel
|
|
||||||
bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// backend API
|
// backend API
|
||||||
// user-code should use only these functions
|
// user-code should use only these functions
|
||||||
//
|
//
|
||||||
|
|
||||||
|
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
||||||
|
|
||||||
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||||
|
|
||||||
// helper to check if the device supports a specific family
|
// helper to check if the device supports a specific family
|
||||||
// ideally, the user code should be doing these checks
|
// ideally, the user code should be doing these checks
|
||||||
|
|
447
ggml-metal.m
|
@ -24,8 +24,6 @@
|
||||||
|
|
||||||
#define UNUSED(x) (void)(x)
|
#define UNUSED(x) (void)(x)
|
||||||
|
|
||||||
#define GGML_MAX_CONCUR (2*GGML_DEFAULT_GRAPH_SIZE)
|
|
||||||
|
|
||||||
#define GGML_METAL_MAX_KERNELS 256
|
#define GGML_METAL_MAX_KERNELS 256
|
||||||
|
|
||||||
struct ggml_metal_buffer {
|
struct ggml_metal_buffer {
|
||||||
|
@ -172,9 +170,6 @@ struct ggml_metal_context {
|
||||||
id<MTLCommandQueue> queue;
|
id<MTLCommandQueue> queue;
|
||||||
id<MTLLibrary> library;
|
id<MTLLibrary> library;
|
||||||
|
|
||||||
id<MTLCommandBuffer> command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS];
|
|
||||||
id<MTLComputeCommandEncoder> command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS];
|
|
||||||
|
|
||||||
dispatch_queue_t d_queue;
|
dispatch_queue_t d_queue;
|
||||||
|
|
||||||
int n_buffers;
|
int n_buffers;
|
||||||
|
@ -182,9 +177,6 @@ struct ggml_metal_context {
|
||||||
|
|
||||||
struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
|
struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
|
||||||
|
|
||||||
int concur_list[GGML_MAX_CONCUR];
|
|
||||||
int concur_list_len;
|
|
||||||
|
|
||||||
bool support_simdgroup_reduction;
|
bool support_simdgroup_reduction;
|
||||||
bool support_simdgroup_mm;
|
bool support_simdgroup_mm;
|
||||||
};
|
};
|
||||||
|
@ -200,7 +192,6 @@ struct ggml_metal_context {
|
||||||
@implementation GGMLMetalClass
|
@implementation GGMLMetalClass
|
||||||
@end
|
@end
|
||||||
|
|
||||||
|
|
||||||
static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
|
static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
|
||||||
fprintf(stderr, "%s", msg);
|
fprintf(stderr, "%s", msg);
|
||||||
|
|
||||||
|
@ -211,11 +202,6 @@ static void ggml_metal_default_log_callback(enum ggml_log_level level, const cha
|
||||||
ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
|
ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
|
||||||
void * ggml_metal_log_user_data = NULL;
|
void * ggml_metal_log_user_data = NULL;
|
||||||
|
|
||||||
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
|
||||||
ggml_metal_log_callback = log_callback;
|
|
||||||
ggml_metal_log_user_data = user_data;
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||||
static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
|
static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
|
||||||
if (ggml_metal_log_callback != NULL) {
|
if (ggml_metal_log_callback != NULL) {
|
||||||
|
@ -238,24 +224,33 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
static void * ggml_metal_host_malloc(size_t n) {
|
||||||
|
void * data = NULL;
|
||||||
|
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
|
||||||
|
if (result != 0) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
|
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
|
||||||
|
|
||||||
id<MTLDevice> device;
|
#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
|
||||||
NSString * s;
|
|
||||||
|
|
||||||
#if TARGET_OS_OSX
|
|
||||||
// Show all the Metal device instances in the system
|
// Show all the Metal device instances in the system
|
||||||
NSArray * devices = MTLCopyAllDevices();
|
NSArray * devices = MTLCopyAllDevices();
|
||||||
for (device in devices) {
|
for (id<MTLDevice> device in devices) {
|
||||||
s = [device name];
|
NSString * s = [device name];
|
||||||
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
|
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
|
||||||
}
|
}
|
||||||
|
[devices release]; // since it was created by a *Copy* C method
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Pick and show default Metal device
|
// Pick and show default Metal device
|
||||||
device = MTLCreateSystemDefaultDevice();
|
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
||||||
s = [device name];
|
NSString * s = [device name];
|
||||||
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
||||||
|
|
||||||
// Configure context
|
// Configure context
|
||||||
|
@ -264,7 +259,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||||
ctx->queue = [ctx->device newCommandQueue];
|
ctx->queue = [ctx->device newCommandQueue];
|
||||||
ctx->n_buffers = 0;
|
ctx->n_buffers = 0;
|
||||||
ctx->concur_list_len = 0;
|
|
||||||
|
|
||||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||||
|
|
||||||
|
@ -307,6 +301,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@autoreleasepool {
|
||||||
// dictionary of preprocessor macros
|
// dictionary of preprocessor macros
|
||||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||||
|
|
||||||
|
@ -320,9 +315,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
//[options setFastMathEnabled:false];
|
//[options setFastMathEnabled:false];
|
||||||
|
|
||||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||||
|
}
|
||||||
[options release];
|
|
||||||
[prep release];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
|
@ -331,7 +324,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if TARGET_OS_OSX
|
|
||||||
// print MTL GPU family:
|
// print MTL GPU family:
|
||||||
GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
|
GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
|
||||||
|
|
||||||
|
@ -371,7 +363,12 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false");
|
GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false");
|
||||||
GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
|
GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
|
||||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||||
|
|
||||||
|
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
||||||
|
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
|
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
|
||||||
|
}
|
||||||
|
#elif TARGET_OS_OSX
|
||||||
if (ctx->device.maxTransferRate != 0) {
|
if (ctx->device.maxTransferRate != 0) {
|
||||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
|
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
|
||||||
} else {
|
} else {
|
||||||
|
@ -531,7 +528,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
return ctx;
|
return ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_metal_free(struct ggml_metal_context * ctx) {
|
static void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||||
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
|
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
|
||||||
|
|
||||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||||
|
@ -557,33 +554,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||||
free(ctx);
|
free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
void * ggml_metal_host_malloc(size_t n) {
|
|
||||||
void * data = NULL;
|
|
||||||
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
|
|
||||||
if (result != 0) {
|
|
||||||
GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_metal_host_free(void * data) {
|
|
||||||
free(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
|
|
||||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
|
||||||
}
|
|
||||||
|
|
||||||
int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
|
|
||||||
return ctx->concur_list_len;
|
|
||||||
}
|
|
||||||
|
|
||||||
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
|
|
||||||
return ctx->concur_list;
|
|
||||||
}
|
|
||||||
|
|
||||||
// temporarily defined here for compatibility between ggml-backend and the old API
|
// temporarily defined here for compatibility between ggml-backend and the old API
|
||||||
|
|
||||||
struct ggml_backend_metal_buffer {
|
struct ggml_backend_metal_buffer {
|
||||||
|
@ -656,209 +626,6 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
||||||
return nil;
|
return nil;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_metal_add_buffer(
|
|
||||||
struct ggml_metal_context * ctx,
|
|
||||||
const char * name,
|
|
||||||
void * data,
|
|
||||||
size_t size,
|
|
||||||
size_t max_size) {
|
|
||||||
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
|
|
||||||
GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (data) {
|
|
||||||
// verify that the buffer does not overlap with any of the existing buffers
|
|
||||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
|
||||||
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
|
|
||||||
|
|
||||||
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
|
|
||||||
GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t size_page = sysconf(_SC_PAGESIZE);
|
|
||||||
|
|
||||||
size_t size_aligned = size;
|
|
||||||
if ((size_aligned % size_page) != 0) {
|
|
||||||
size_aligned += (size_page - (size_aligned % size_page));
|
|
||||||
}
|
|
||||||
|
|
||||||
// the buffer fits into the max buffer size allowed by the device
|
|
||||||
if (size_aligned <= ctx->device.maxBufferLength) {
|
|
||||||
ctx->buffers[ctx->n_buffers].name = name;
|
|
||||||
ctx->buffers[ctx->n_buffers].data = data;
|
|
||||||
ctx->buffers[ctx->n_buffers].size = size;
|
|
||||||
|
|
||||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
|
||||||
|
|
||||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
|
||||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
|
||||||
|
|
||||||
++ctx->n_buffers;
|
|
||||||
} else {
|
|
||||||
// this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
|
|
||||||
// one of the views
|
|
||||||
const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
|
|
||||||
const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
|
|
||||||
const size_t size_view = ctx->device.maxBufferLength;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < size; i += size_step) {
|
|
||||||
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
|
|
||||||
|
|
||||||
ctx->buffers[ctx->n_buffers].name = name;
|
|
||||||
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
|
|
||||||
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
|
|
||||||
|
|
||||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
|
||||||
|
|
||||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
|
||||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
|
||||||
if (i + size_step < size) {
|
|
||||||
GGML_METAL_LOG_INFO("\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
++ctx->n_buffers;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if TARGET_OS_OSX
|
|
||||||
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
|
||||||
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
|
||||||
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
|
||||||
|
|
||||||
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
|
||||||
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
|
||||||
} else {
|
|
||||||
GGML_METAL_LOG_INFO("\n");
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_metal_set_tensor(
|
|
||||||
struct ggml_metal_context * ctx,
|
|
||||||
struct ggml_tensor * t) {
|
|
||||||
size_t offs;
|
|
||||||
id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
|
|
||||||
|
|
||||||
memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_metal_get_tensor(
|
|
||||||
struct ggml_metal_context * ctx,
|
|
||||||
struct ggml_tensor * t) {
|
|
||||||
size_t offs;
|
|
||||||
id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
|
|
||||||
|
|
||||||
memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_metal_graph_find_concurrency(
|
|
||||||
struct ggml_metal_context * ctx,
|
|
||||||
struct ggml_cgraph * gf, bool check_mem) {
|
|
||||||
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
|
|
||||||
int nodes_unused[GGML_MAX_CONCUR];
|
|
||||||
|
|
||||||
for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
|
|
||||||
for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; }
|
|
||||||
ctx->concur_list_len = 0;
|
|
||||||
|
|
||||||
int n_left = gf->n_nodes;
|
|
||||||
int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
|
|
||||||
int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
|
|
||||||
|
|
||||||
while (n_left > 0) {
|
|
||||||
// number of nodes at a layer (that can be issued concurrently)
|
|
||||||
int concurrency = 0;
|
|
||||||
for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
|
|
||||||
if (nodes_unused[i]) {
|
|
||||||
// if the requirements for gf->nodes[i] are satisfied
|
|
||||||
int exe_flag = 1;
|
|
||||||
|
|
||||||
// scan all srcs
|
|
||||||
for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
|
|
||||||
struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
|
|
||||||
if (src_cur) {
|
|
||||||
// if is leaf nodes it's satisfied.
|
|
||||||
// TODO: ggml_is_leaf()
|
|
||||||
if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// otherwise this src should be the output from previous nodes.
|
|
||||||
int is_found = 0;
|
|
||||||
|
|
||||||
// scan 2*search_depth back because we inserted barrier.
|
|
||||||
//for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
|
|
||||||
for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
|
|
||||||
if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
|
|
||||||
is_found = 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (is_found == 0) {
|
|
||||||
exe_flag = 0;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (exe_flag && check_mem) {
|
|
||||||
// check if nodes[i]'s data will be overwritten by a node before nodes[i].
|
|
||||||
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
|
|
||||||
int64_t data_start = (int64_t) gf->nodes[i]->data;
|
|
||||||
int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
|
|
||||||
for (int j = n_start; j < i; j++) {
|
|
||||||
if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
|
|
||||||
&& gf->nodes[j]->op != GGML_OP_VIEW \
|
|
||||||
&& gf->nodes[j]->op != GGML_OP_TRANSPOSE \
|
|
||||||
&& gf->nodes[j]->op != GGML_OP_PERMUTE) {
|
|
||||||
if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
|
|
||||||
((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
exe_flag = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (exe_flag) {
|
|
||||||
ctx->concur_list[level_pos + concurrency] = i;
|
|
||||||
nodes_unused[i] = 0;
|
|
||||||
concurrency++;
|
|
||||||
ctx->concur_list_len++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
n_left -= concurrency;
|
|
||||||
// adding a barrier different layer
|
|
||||||
ctx->concur_list[level_pos + concurrency] = -1;
|
|
||||||
ctx->concur_list_len++;
|
|
||||||
// jump all sorted nodes at nodes_bak
|
|
||||||
while (!nodes_unused[n_start]) {
|
|
||||||
n_start++;
|
|
||||||
}
|
|
||||||
level_pos += concurrency + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ctx->concur_list_len > GGML_MAX_CONCUR) {
|
|
||||||
GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
|
static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
|
@ -940,51 +707,44 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_metal_graph_compute(
|
static bool ggml_metal_graph_compute(
|
||||||
struct ggml_metal_context * ctx,
|
struct ggml_metal_context * ctx,
|
||||||
struct ggml_cgraph * gf) {
|
struct ggml_cgraph * gf) {
|
||||||
@autoreleasepool {
|
|
||||||
|
|
||||||
// if there is ctx->concur_list, dispatch concurrently
|
|
||||||
// else fallback to serial dispatch
|
|
||||||
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
|
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
|
||||||
|
edesc.dispatchType = MTLDispatchTypeSerial;
|
||||||
const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
|
|
||||||
|
|
||||||
const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
|
|
||||||
edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
|
|
||||||
|
|
||||||
// create multiple command buffers and enqueue them
|
// create multiple command buffers and enqueue them
|
||||||
// then, we encode the graph into the command buffers in parallel
|
// then, we encode the graph into the command buffers in parallel
|
||||||
|
|
||||||
|
const int n_nodes = gf->n_nodes;
|
||||||
const int n_cb = ctx->n_cb;
|
const int n_cb = ctx->n_cb;
|
||||||
|
|
||||||
for (int i = 0; i < n_cb; ++i) {
|
|
||||||
ctx->command_buffers[i] = [ctx->queue commandBuffer];
|
|
||||||
|
|
||||||
// enqueue the command buffers in order to specify their execution order
|
|
||||||
[ctx->command_buffers[i] enqueue];
|
|
||||||
|
|
||||||
ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
|
||||||
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
|
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
|
||||||
|
|
||||||
dispatch_async(ctx->d_queue, ^{
|
id<MTLCommandBuffer> command_buffer_builder[n_cb];
|
||||||
|
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||||
|
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
|
||||||
|
command_buffer_builder[cb_idx] = command_buffer;
|
||||||
|
|
||||||
|
// enqueue the command buffers in order to specify their execution order
|
||||||
|
[command_buffer enqueue];
|
||||||
|
}
|
||||||
|
const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
|
||||||
|
|
||||||
|
dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
|
||||||
|
const int cb_idx = iter;
|
||||||
|
|
||||||
size_t offs_src0 = 0;
|
size_t offs_src0 = 0;
|
||||||
size_t offs_src1 = 0;
|
size_t offs_src1 = 0;
|
||||||
size_t offs_dst = 0;
|
size_t offs_dst = 0;
|
||||||
|
|
||||||
id<MTLCommandBuffer> command_buffer = ctx->command_buffers[cb_idx];
|
id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
|
||||||
id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
|
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||||
|
|
||||||
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
|
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
|
||||||
const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
|
const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
|
||||||
|
|
||||||
for (int ind = node_start; ind < node_end; ++ind) {
|
for (int i = node_start; i < node_end; ++i) {
|
||||||
const int i = has_concur ? ctx->concur_list[ind] : ind;
|
|
||||||
|
|
||||||
if (i == -1) {
|
if (i == -1) {
|
||||||
[encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
|
[encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
|
||||||
continue;
|
continue;
|
||||||
|
@ -2472,24 +2232,19 @@ bool ggml_metal_graph_compute(
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
if (encoder != nil) {
|
|
||||||
[encoder endEncoding];
|
[encoder endEncoding];
|
||||||
encoder = nil;
|
|
||||||
}
|
|
||||||
|
|
||||||
[command_buffer commit];
|
[command_buffer commit];
|
||||||
});
|
});
|
||||||
}
|
|
||||||
|
|
||||||
// wait for all threads to finish
|
// Wait for completion and check status of each command buffer
|
||||||
dispatch_barrier_sync(ctx->d_queue, ^{});
|
|
||||||
|
|
||||||
// check status of command buffers
|
|
||||||
// needed to detect if the device ran out-of-memory for example (#1881)
|
// needed to detect if the device ran out-of-memory for example (#1881)
|
||||||
for (int i = 0; i < n_cb; i++) {
|
|
||||||
[ctx->command_buffers[i] waitUntilCompleted];
|
|
||||||
|
|
||||||
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
for (int i = 0; i < n_cb; ++i) {
|
||||||
|
id<MTLCommandBuffer> command_buffer = command_buffers[i];
|
||||||
|
[command_buffer waitUntilCompleted];
|
||||||
|
|
||||||
|
MTLCommandBufferStatus status = [command_buffer status];
|
||||||
if (status != MTLCommandBufferStatusCompleted) {
|
if (status != MTLCommandBufferStatusCompleted) {
|
||||||
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||||
return false;
|
return false;
|
||||||
|
@ -2498,7 +2253,6 @@ bool ggml_metal_graph_compute(
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
@ -2529,13 +2283,13 @@ static void ggml_backend_metal_free_device(void) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
|
GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||||
return "Metal";
|
return "Metal";
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
for (int i = 0; i < ctx->n_buffers; i++) {
|
for (int i = 0; i < ctx->n_buffers; i++) {
|
||||||
|
@ -2550,25 +2304,25 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||||
free(ctx);
|
free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
return ctx->all_data;
|
return ctx->all_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
memcpy((char *)tensor->data + offset, data, size);
|
memcpy((char *)tensor->data + offset, data, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
memcpy(data, (const char *)tensor->data + offset, size);
|
memcpy(data, (const char *)tensor->data + offset, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||||
if (ggml_backend_buffer_is_host(src->buffer)) {
|
if (ggml_backend_buffer_is_host(src->buffer)) {
|
||||||
memcpy(dst->data, src->data, ggml_nbytes(src));
|
memcpy(dst->data, src->data, ggml_nbytes(src));
|
||||||
return true;
|
return true;
|
||||||
|
@ -2578,7 +2332,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
memset(ctx->all_data, value, ctx->all_size);
|
memset(ctx->all_data, value, ctx->all_size);
|
||||||
|
@ -2598,13 +2352,32 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
||||||
|
|
||||||
// default buffer type
|
// default buffer type
|
||||||
|
|
||||||
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
return "Metal";
|
return "Metal";
|
||||||
|
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) {
|
||||||
|
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
||||||
|
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||||
|
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
||||||
|
device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||||
|
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
||||||
|
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
||||||
|
} else {
|
||||||
|
GGML_METAL_LOG_INFO("\n");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
UNUSED(device);
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
||||||
|
|
||||||
const size_t size_page = sysconf(_SC_PAGESIZE);
|
const size_t size_page = sysconf(_SC_PAGESIZE);
|
||||||
|
@ -2636,44 +2409,29 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
|
||||||
|
ggml_backend_metal_log_allocated_size(device);
|
||||||
|
|
||||||
#if TARGET_OS_OSX
|
|
||||||
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
|
||||||
device.currentAllocatedSize / 1024.0 / 1024.0,
|
|
||||||
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
|
||||||
|
|
||||||
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
|
||||||
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
|
||||||
} else {
|
|
||||||
GGML_METAL_LOG_INFO("\n");
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
|
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
return 32;
|
return 32;
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||||
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
||||||
|
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
||||||
/* .iface = */ {
|
/* .iface = */ {
|
||||||
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
|
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
|
||||||
|
@ -2691,7 +2449,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
|
|
||||||
// buffer from ptr
|
// buffer from ptr
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
|
GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
||||||
|
|
||||||
ctx->all_data = data;
|
ctx->all_data = data;
|
||||||
|
@ -2759,50 +2517,38 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if TARGET_OS_OSX
|
ggml_backend_metal_log_allocated_size(device);
|
||||||
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
|
||||||
device.currentAllocatedSize / 1024.0 / 1024.0,
|
|
||||||
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
|
||||||
|
|
||||||
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
|
||||||
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
|
||||||
} else {
|
|
||||||
GGML_METAL_LOG_INFO("\n");
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
|
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
// backend
|
// backend
|
||||||
|
|
||||||
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
||||||
return "Metal";
|
return "Metal";
|
||||||
|
|
||||||
UNUSED(backend);
|
UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_free(ggml_backend_t backend) {
|
GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
|
||||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||||
ggml_metal_free(ctx);
|
ggml_metal_free(ctx);
|
||||||
free(backend);
|
free(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
return ggml_backend_metal_buffer_type();
|
return ggml_backend_metal_buffer_type();
|
||||||
|
|
||||||
UNUSED(backend);
|
UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
||||||
|
|
||||||
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
||||||
|
|
||||||
return ggml_metal_supports_op(metal_ctx, op);
|
return ggml_metal_supports_op(metal_ctx, op);
|
||||||
|
@ -2823,6 +2569,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
|
||||||
/* .supports_op = */ ggml_backend_metal_supports_op,
|
/* .supports_op = */ ggml_backend_metal_supports_op,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
||||||
|
ggml_metal_log_callback = log_callback;
|
||||||
|
ggml_metal_log_user_data = user_data;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_metal_init(void) {
|
ggml_backend_t ggml_backend_metal_init(void) {
|
||||||
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
||||||
|
|
||||||
|
@ -2849,7 +2600,7 @@ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||||
|
|
||||||
ggml_metal_set_n_cb(ctx, n_cb);
|
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||||
|
@ -2860,9 +2611,9 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||||
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
|
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
|
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
|
||||||
return ggml_backend_metal_init();
|
return ggml_backend_metal_init();
|
||||||
|
|
||||||
GGML_UNUSED(params);
|
GGML_UNUSED(params);
|
||||||
|
|
1571
ggml-quants.c
|
@ -196,8 +196,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
|
||||||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
||||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||||
void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
|
|
||||||
void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);
|
|
||||||
|
|
||||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
||||||
|
@ -212,8 +210,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
|
|
||||||
void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
|
|
||||||
|
|
||||||
// Dequantization
|
// Dequantization
|
||||||
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
||||||
|
@ -246,3 +242,21 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
|
||||||
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||||
|
//
|
||||||
|
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
|
||||||
|
void iq2xs_init_impl(int grid_size);
|
||||||
|
void iq2xs_free_impl(int grid_size);
|
||||||
|
|
216
ggml.c
|
@ -394,12 +394,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
||||||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
||||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
||||||
|
|
||||||
ggml_collect_imatrix_t g_imatrix_collect = NULL;
|
|
||||||
|
|
||||||
void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
|
|
||||||
g_imatrix_collect = imatrix_collect;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_I8] = {
|
[GGML_TYPE_I8] = {
|
||||||
.type_name = "i8",
|
.type_name = "i8",
|
||||||
|
@ -585,8 +579,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.type_size = sizeof(block_iq2_xxs),
|
.type_size = sizeof(block_iq2_xxs),
|
||||||
.is_quantized = true,
|
.is_quantized = true,
|
||||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
|
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
|
||||||
.from_float = quantize_row_iq2_xxs,
|
.from_float = NULL,
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
},
|
},
|
||||||
|
@ -596,8 +590,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.type_size = sizeof(block_iq2_xs),
|
.type_size = sizeof(block_iq2_xs),
|
||||||
.is_quantized = true,
|
.is_quantized = true,
|
||||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
|
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
|
||||||
.from_float = quantize_row_iq2_xs,
|
.from_float = NULL,
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
},
|
},
|
||||||
|
@ -1990,19 +1984,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
|
||||||
GGML_PRINT("%s: --- end ---\n", __func__);
|
GGML_PRINT("%s: --- end ---\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
||||||
size_t nbytes;
|
size_t nbytes;
|
||||||
size_t blck_size = ggml_blck_size(tensor->type);
|
size_t blck_size = ggml_blck_size(tensor->type);
|
||||||
if (blck_size == 1) {
|
if (blck_size == 1) {
|
||||||
|
@ -2025,15 +2019,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
||||||
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_blck_size(enum ggml_type type) {
|
GGML_CALL int ggml_blck_size(enum ggml_type type) {
|
||||||
return type_traits[type].blck_size;
|
return type_traits[type].blck_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_type_size(enum ggml_type type) {
|
GGML_CALL size_t ggml_type_size(enum ggml_type type) {
|
||||||
return type_traits[type].type_size;
|
return type_traits[type].type_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
||||||
assert(ne % ggml_blck_size(type) == 0);
|
assert(ne % ggml_blck_size(type) == 0);
|
||||||
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
||||||
}
|
}
|
||||||
|
@ -2042,15 +2036,15 @@ double ggml_type_sizef(enum ggml_type type) {
|
||||||
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * ggml_type_name(enum ggml_type type) {
|
GGML_CALL const char * ggml_type_name(enum ggml_type type) {
|
||||||
return type_traits[type].type_name;
|
return type_traits[type].type_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_is_quantized(enum ggml_type type) {
|
GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
|
||||||
return type_traits[type].is_quantized;
|
return type_traits[type].is_quantized;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * ggml_op_name(enum ggml_op op) {
|
GGML_CALL const char * ggml_op_name(enum ggml_op op) {
|
||||||
return GGML_OP_NAME[op];
|
return GGML_OP_NAME[op];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2062,7 +2056,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
|
||||||
return GGML_UNARY_OP_NAME[op];
|
return GGML_UNARY_OP_NAME[op];
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * ggml_op_desc(const struct ggml_tensor * t) {
|
GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
|
||||||
if (t->op == GGML_OP_UNARY) {
|
if (t->op == GGML_OP_UNARY) {
|
||||||
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
||||||
return ggml_unary_op_name(uop);
|
return ggml_unary_op_name(uop);
|
||||||
|
@ -2072,7 +2066,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
||||||
return ggml_type_size(tensor->type);
|
return ggml_type_size(tensor->type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2154,11 +2148,11 @@ size_t ggml_tensor_overhead(void) {
|
||||||
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
|
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
||||||
return tensor->nb[0] > tensor->nb[1];
|
return tensor->nb[0] > tensor->nb[1];
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return
|
return
|
||||||
|
@ -2177,7 +2171,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
|
||||||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
|
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
|
||||||
|
@ -3079,7 +3073,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
|
||||||
return (float *)(tensor->data);
|
return (float *)(tensor->data);
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
|
GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(tensor->op == GGML_OP_UNARY);
|
GGML_ASSERT(tensor->op == GGML_OP_UNARY);
|
||||||
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
|
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
|
||||||
}
|
}
|
||||||
|
@ -9790,10 +9784,6 @@ static void ggml_compute_forward_mul_mat(
|
||||||
const int ith = params->ith;
|
const int ith = params->ith;
|
||||||
const int nth = params->nth;
|
const int nth = params->nth;
|
||||||
|
|
||||||
if (ith == 1 && g_imatrix_collect) {
|
|
||||||
g_imatrix_collect(src0, src1);
|
|
||||||
}
|
|
||||||
|
|
||||||
const enum ggml_type type = src0->type;
|
const enum ggml_type type = src0->type;
|
||||||
|
|
||||||
const bool src1_cont = ggml_is_contiguous(src1);
|
const bool src1_cont = ggml_is_contiguous(src1);
|
||||||
|
@ -10097,10 +10087,6 @@ static void ggml_compute_forward_mul_mat_id(
|
||||||
|
|
||||||
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
|
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
|
||||||
|
|
||||||
if (ith == 1 && g_imatrix_collect) {
|
|
||||||
g_imatrix_collect(src0_cur, src1);
|
|
||||||
}
|
|
||||||
|
|
||||||
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||||
|
|
||||||
|
@ -11638,7 +11624,22 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, fl
|
||||||
return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_rope_yarn_corr_dims(
|
static void ggml_rope_cache_init(
|
||||||
|
float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
||||||
|
float * cache, float sin_sign, float theta_scale
|
||||||
|
) {
|
||||||
|
float theta = theta_base;
|
||||||
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||||
|
rope_yarn(
|
||||||
|
theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
||||||
|
);
|
||||||
|
cache[i0 + 1] *= sin_sign;
|
||||||
|
|
||||||
|
theta *= theta_scale;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_CALL void ggml_rope_yarn_corr_dims(
|
||||||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
||||||
) {
|
) {
|
||||||
// start and end correction dims
|
// start and end correction dims
|
||||||
|
@ -11720,6 +11721,12 @@ static void ggml_compute_forward_rope_f32(
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||||
const int64_t p = pos[i2];
|
const int64_t p = pos[i2];
|
||||||
|
|
||||||
|
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
||||||
|
if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
|
||||||
|
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
||||||
|
}
|
||||||
|
|
||||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||||
if (ir++ < ir0) continue;
|
if (ir++ < ir0) continue;
|
||||||
if (ir > ir1) break;
|
if (ir > ir1) break;
|
||||||
|
@ -11753,18 +11760,13 @@ static void ggml_compute_forward_rope_f32(
|
||||||
}
|
}
|
||||||
} else if (!is_neox) {
|
} else if (!is_neox) {
|
||||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||||
float cos_theta, sin_theta;
|
const float cos_theta = cache[i0 + 0];
|
||||||
rope_yarn(
|
const float sin_theta = cache[i0 + 1];
|
||||||
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
|
|
||||||
);
|
|
||||||
sin_theta *= sin_sign;
|
|
||||||
|
|
||||||
// zeta scaling for xPos only:
|
// zeta scaling for xPos only:
|
||||||
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
||||||
if (xpos_down) zeta = 1.0f / zeta;
|
if (xpos_down) zeta = 1.0f / zeta;
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
|
||||||
|
|
||||||
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
|
||||||
|
@ -11888,6 +11890,12 @@ static void ggml_compute_forward_rope_f16(
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||||
const int64_t p = pos[i2];
|
const int64_t p = pos[i2];
|
||||||
|
|
||||||
|
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
||||||
|
if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
|
||||||
|
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
||||||
|
}
|
||||||
|
|
||||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||||
if (ir++ < ir0) continue;
|
if (ir++ < ir0) continue;
|
||||||
if (ir > ir1) break;
|
if (ir > ir1) break;
|
||||||
|
@ -11921,13 +11929,8 @@ static void ggml_compute_forward_rope_f16(
|
||||||
}
|
}
|
||||||
} else if (!is_neox) {
|
} else if (!is_neox) {
|
||||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||||
float cos_theta, sin_theta;
|
const float cos_theta = cache[i0 + 0];
|
||||||
rope_yarn(
|
const float sin_theta = cache[i0 + 1];
|
||||||
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
|
|
||||||
);
|
|
||||||
sin_theta *= sin_sign;
|
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
|
||||||
|
|
||||||
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
@ -16722,6 +16725,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_SOFT_MAX:
|
case GGML_OP_SOFT_MAX:
|
||||||
|
case GGML_OP_ROPE:
|
||||||
{
|
{
|
||||||
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
||||||
} break;
|
} break;
|
||||||
|
@ -18520,6 +18524,28 @@ enum ggml_opt_result ggml_opt_resume_g(
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
void ggml_quantize_init(enum ggml_type type) {
|
||||||
|
ggml_critical_section_start();
|
||||||
|
|
||||||
|
switch (type) {
|
||||||
|
case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
|
||||||
|
case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
|
||||||
|
default: // nothing
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_critical_section_end();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_quantize_free(void) {
|
||||||
|
ggml_critical_section_start();
|
||||||
|
|
||||||
|
iq2xs_free_impl(256);
|
||||||
|
iq2xs_free_impl(512);
|
||||||
|
|
||||||
|
ggml_critical_section_end();
|
||||||
|
}
|
||||||
|
|
||||||
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||||
assert(k % QK4_0 == 0);
|
assert(k % QK4_0 == 0);
|
||||||
const int nb = k / QK4_0;
|
const int nb = k / QK4_0;
|
||||||
|
@ -18647,32 +18673,53 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
|
||||||
return (n/QK8_0*sizeof(block_q8_0));
|
return (n/QK8_0*sizeof(block_q8_0));
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
|
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
||||||
|
return
|
||||||
|
type == GGML_TYPE_IQ2_XXS ||
|
||||||
|
type == GGML_TYPE_IQ2_XS;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
|
||||||
|
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
|
||||||
|
ggml_quantize_init(type); // this is noop if already initialized
|
||||||
size_t result = 0;
|
size_t result = 0;
|
||||||
|
int n = nrows * n_per_row;
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK4_0 == 0);
|
GGML_ASSERT(start % QK4_0 == 0);
|
||||||
block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q4_0(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK4_1 == 0);
|
GGML_ASSERT(start % QK4_1 == 0);
|
||||||
block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q4_1(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK5_0 == 0);
|
GGML_ASSERT(start % QK5_0 == 0);
|
||||||
block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q5_0(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK5_1 == 0);
|
GGML_ASSERT(start % QK5_1 == 0);
|
||||||
block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q5_1(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
{
|
{
|
||||||
|
@ -18683,54 +18730,77 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q2_K * block = (block_q2_K*)dst + start / QK_K;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q2_K(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q3_K * block = (block_q3_K*)dst + start / QK_K;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q3_K(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q4_K * block = (block_q4_K*)dst + start / QK_K;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q4_K(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q5_K * block = (block_q5_K*)dst + start / QK_K;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q5_K(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q6_K * block = (block_q6_K*)dst + start / QK_K;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q6_K(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
|
GGML_ASSERT(imatrix);
|
||||||
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
|
GGML_ASSERT(imatrix);
|
||||||
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
{
|
{
|
||||||
int elemsize = sizeof(ggml_fp16_t);
|
size_t elemsize = sizeof(ggml_fp16_t);
|
||||||
ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
|
ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
|
||||||
result = n * elemsize;
|
result = n * elemsize;
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
{
|
{
|
||||||
int elemsize = sizeof(float);
|
size_t elemsize = sizeof(float);
|
||||||
result = n * elemsize;
|
result = n * elemsize;
|
||||||
memcpy((uint8_t *)dst + start * elemsize, src + start, result);
|
memcpy((uint8_t *)dst + start * elemsize, src + start, result);
|
||||||
} break;
|
} break;
|
||||||
|
|
67
ggml.h
|
@ -187,6 +187,16 @@
|
||||||
# define GGML_API
|
# define GGML_API
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_MULTIPLATFORM
|
||||||
|
# if defined(_WIN32)
|
||||||
|
# define GGML_CALL
|
||||||
|
# else
|
||||||
|
# define GGML_CALL __attribute__((__ms_abi__))
|
||||||
|
# endif
|
||||||
|
#else
|
||||||
|
# define GGML_CALL
|
||||||
|
#endif
|
||||||
|
|
||||||
// TODO: support for clang
|
// TODO: support for clang
|
||||||
#ifdef __GNUC__
|
#ifdef __GNUC__
|
||||||
# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
||||||
|
@ -649,36 +659,36 @@ extern "C" {
|
||||||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||||
|
|
||||||
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
||||||
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||||
|
|
||||||
GGML_API int ggml_blck_size(enum ggml_type type);
|
GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
|
||||||
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||||
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||||
|
|
||||||
GGML_DEPRECATED(
|
GGML_DEPRECATED(
|
||||||
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
||||||
"use ggml_row_size() instead");
|
"use ggml_row_size() instead");
|
||||||
|
|
||||||
GGML_API const char * ggml_type_name(enum ggml_type type);
|
GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
|
||||||
GGML_API const char * ggml_op_name (enum ggml_op op);
|
GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
|
||||||
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
||||||
|
|
||||||
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
||||||
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
||||||
|
|
||||||
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
|
||||||
|
|
||||||
// TODO: temporary until model loading of ggml examples is refactored
|
// TODO: temporary until model loading of ggml examples is refactored
|
||||||
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
||||||
|
|
||||||
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
||||||
|
@ -770,7 +780,7 @@ extern "C" {
|
||||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
||||||
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
||||||
|
@ -1413,7 +1423,7 @@ extern "C" {
|
||||||
float beta_slow);
|
float beta_slow);
|
||||||
|
|
||||||
// compute correction dims for YaRN RoPE scaling
|
// compute correction dims for YaRN RoPE scaling
|
||||||
void ggml_rope_yarn_corr_dims(
|
GGML_CALL void ggml_rope_yarn_corr_dims(
|
||||||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
||||||
|
|
||||||
// xPos RoPE, in-place, returns view(a)
|
// xPos RoPE, in-place, returns view(a)
|
||||||
|
@ -2055,6 +2065,18 @@ extern "C" {
|
||||||
// quantization
|
// quantization
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// - ggml_quantize_init can be called multiple times with the same type
|
||||||
|
// it will only initialize the quantization tables for the first call or after ggml_quantize_free
|
||||||
|
// automatically called by ggml_quantize_chunk for convenience
|
||||||
|
//
|
||||||
|
// - ggml_quantize_free will free any memory allocated by ggml_quantize_init
|
||||||
|
// call this at the end of the program to avoid memory leaks
|
||||||
|
//
|
||||||
|
// note: these are thread-safe
|
||||||
|
//
|
||||||
|
GGML_API void ggml_quantize_init(enum ggml_type type);
|
||||||
|
GGML_API void ggml_quantize_free(void);
|
||||||
|
|
||||||
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
||||||
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
|
@ -2067,16 +2089,13 @@ extern "C" {
|
||||||
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
|
|
||||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
// some quantization type cannot be used without an importance matrix
|
||||||
|
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
||||||
|
|
||||||
//
|
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
||||||
// Importance matrix
|
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
||||||
//
|
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
|
|
||||||
GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// gguf
|
// gguf
|
||||||
|
|
|
@ -97,8 +97,10 @@ class MODEL_ARCH(IntEnum):
|
||||||
BLOOM = auto()
|
BLOOM = auto()
|
||||||
STABLELM = auto()
|
STABLELM = auto()
|
||||||
QWEN = auto()
|
QWEN = auto()
|
||||||
|
QWEN2 = auto()
|
||||||
PHI2 = auto()
|
PHI2 = auto()
|
||||||
PLAMO = auto()
|
PLAMO = auto()
|
||||||
|
CODESHELL = auto()
|
||||||
|
|
||||||
|
|
||||||
class MODEL_TENSOR(IntEnum):
|
class MODEL_TENSOR(IntEnum):
|
||||||
|
@ -145,8 +147,10 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.BLOOM: "bloom",
|
MODEL_ARCH.BLOOM: "bloom",
|
||||||
MODEL_ARCH.STABLELM: "stablelm",
|
MODEL_ARCH.STABLELM: "stablelm",
|
||||||
MODEL_ARCH.QWEN: "qwen",
|
MODEL_ARCH.QWEN: "qwen",
|
||||||
|
MODEL_ARCH.QWEN2: "qwen2",
|
||||||
MODEL_ARCH.PHI2: "phi2",
|
MODEL_ARCH.PHI2: "phi2",
|
||||||
MODEL_ARCH.PLAMO: "plamo",
|
MODEL_ARCH.PLAMO: "plamo",
|
||||||
|
MODEL_ARCH.CODESHELL: "codeshell",
|
||||||
}
|
}
|
||||||
|
|
||||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
|
@ -356,6 +360,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.QWEN2: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
],
|
||||||
MODEL_ARCH.PLAMO: [
|
MODEL_ARCH.PLAMO: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
@ -396,6 +414,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_NORM,
|
MODEL_TENSOR.FFN_NORM,
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
],
|
||||||
|
MODEL_ARCH.CODESHELL: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.POS_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_QKV,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
]
|
]
|
||||||
# TODO
|
# TODO
|
||||||
}
|
}
|
||||||
|
@ -417,6 +448,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.ROPE_FREQS,
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.CODESHELL: [
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|
|
@ -154,6 +154,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
||||||
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
|
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
|
||||||
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
|
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
|
||||||
|
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
|
||||||
),
|
),
|
||||||
|
|
||||||
# Feed-forward norm
|
# Feed-forward norm
|
||||||
|
|
22
llama.h
|
@ -2,6 +2,7 @@
|
||||||
#define LLAMA_H
|
#define LLAMA_H
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
|
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
|
||||||
|
@ -231,6 +232,9 @@ extern "C" {
|
||||||
float yarn_beta_slow; // YaRN high correction dim
|
float yarn_beta_slow; // YaRN high correction dim
|
||||||
uint32_t yarn_orig_ctx; // YaRN original context size
|
uint32_t yarn_orig_ctx; // YaRN original context size
|
||||||
|
|
||||||
|
ggml_backend_sched_eval_callback cb_eval;
|
||||||
|
void * cb_eval_user_data;
|
||||||
|
|
||||||
enum ggml_type type_k; // data type for K cache
|
enum ggml_type type_k; // data type for K cache
|
||||||
enum ggml_type type_v; // data type for V cache
|
enum ggml_type type_v; // data type for V cache
|
||||||
|
|
||||||
|
@ -249,6 +253,7 @@ extern "C" {
|
||||||
bool quantize_output_tensor; // quantize output.weight
|
bool quantize_output_tensor; // quantize output.weight
|
||||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||||
bool pure; // disable k-quant mixtures and quantize all tensors to the same type
|
bool pure; // disable k-quant mixtures and quantize all tensors to the same type
|
||||||
|
void * imatrix; // pointer to importance matrix data
|
||||||
} llama_model_quantize_params;
|
} llama_model_quantize_params;
|
||||||
|
|
||||||
// grammar types
|
// grammar types
|
||||||
|
@ -713,14 +718,21 @@ extern "C" {
|
||||||
float penalty_present);
|
float penalty_present);
|
||||||
|
|
||||||
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
|
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
|
||||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
|
/// @param logits Logits extracted from the original generation context.
|
||||||
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
|
/// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
|
||||||
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
|
/// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
|
||||||
LLAMA_API void llama_sample_classifier_free_guidance(
|
LLAMA_API void llama_sample_apply_guidance(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
float * logits,
|
||||||
|
float * logits_guidance,
|
||||||
|
float scale);
|
||||||
|
|
||||||
|
LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
llama_token_data_array * candidates,
|
llama_token_data_array * candidates,
|
||||||
struct llama_context * guidance_ctx,
|
struct llama_context * guidance_ctx,
|
||||||
float scale);
|
float scale),
|
||||||
|
"use llama_sample_apply_guidance() instead");
|
||||||
|
|
||||||
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
||||||
LLAMA_API void llama_sample_softmax(
|
LLAMA_API void llama_sample_softmax(
|
||||||
|
|
10
scripts/get-hellaswag.sh
Executable file
|
@ -0,0 +1,10 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/hellaswag_val_full.txt
|
||||||
|
|
||||||
|
echo "Usage:"
|
||||||
|
echo ""
|
||||||
|
echo " ./perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
exit 0
|
|
@ -1,3 +1,10 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
|
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
|
||||||
|
|
||||||
|
echo "Usage:"
|
||||||
|
echo ""
|
||||||
|
echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
exit 0
|
||||||
|
|
10
scripts/get-winogrande.sh
Executable file
|
@ -0,0 +1,10 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
wget https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp/raw/main/winogrande-debiased-eval.csv
|
||||||
|
|
||||||
|
echo "Usage:"
|
||||||
|
echo ""
|
||||||
|
echo " ./perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
exit 0
|
|
@ -5,7 +5,7 @@
|
||||||
# Usage:
|
# Usage:
|
||||||
#
|
#
|
||||||
# $ cd /path/to/llama.cpp
|
# $ cd /path/to/llama.cpp
|
||||||
# $ ./scripts/sync-ggml-am.sh
|
# $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2...
|
||||||
#
|
#
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
@ -24,6 +24,11 @@ fi
|
||||||
lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
|
lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
|
||||||
echo "Syncing ggml changes since commit $lc"
|
echo "Syncing ggml changes since commit $lc"
|
||||||
|
|
||||||
|
to_skip=""
|
||||||
|
if [ "$1" == "-skip" ]; then
|
||||||
|
to_skip=$2
|
||||||
|
fi
|
||||||
|
|
||||||
cd $SRC_GGML
|
cd $SRC_GGML
|
||||||
|
|
||||||
git log --oneline $lc..HEAD
|
git log --oneline $lc..HEAD
|
||||||
|
@ -40,6 +45,13 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
while read c; do
|
while read c; do
|
||||||
|
if [ -n "$to_skip" ]; then
|
||||||
|
if [[ $to_skip == *"$c"* ]]; then
|
||||||
|
echo "Skipping $c"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
git format-patch -k $c~1..$c --stdout -- \
|
git format-patch -k $c~1..$c --stdout -- \
|
||||||
include/ggml/ggml*.h \
|
include/ggml/ggml*.h \
|
||||||
src/ggml*.h \
|
src/ggml*.h \
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
400c07f00508e6f60fb25405444b5669c365b0a9
|
6c1ce0bd591a430c1d3f6797d905194581c878c1
|
||||||
|
|
|
@ -49,6 +49,7 @@ llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||||
llama_build_and_test_executable(test-grad0.cpp)
|
llama_build_and_test_executable(test-grad0.cpp)
|
||||||
# llama_build_and_test_executable(test-opt.cpp) # SLOW
|
# llama_build_and_test_executable(test-opt.cpp) # SLOW
|
||||||
llama_build_and_test_executable(test-backend-ops.cpp)
|
llama_build_and_test_executable(test-backend-ops.cpp)
|
||||||
|
llama_build_and_test_executable(test-autorelease.cpp)
|
||||||
|
|
||||||
llama_build_and_test_executable(test-rope.cpp)
|
llama_build_and_test_executable(test-rope.cpp)
|
||||||
|
|
||||||
|
|
28
tests/test-autorelease.cpp
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
// This creates a new context inside a pthread and then tries to exit cleanly.
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
if (argc < 2) {
|
||||||
|
printf("Usage: %s model.gguf\n", argv[0]);
|
||||||
|
return 0; // intentionally return success
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string fname = argv[1];
|
||||||
|
|
||||||
|
std::thread([&fname]() {
|
||||||
|
llama_backend_init(false);
|
||||||
|
auto * model = llama_load_model_from_file(fname.c_str(), llama_model_default_params());
|
||||||
|
auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
llama_backend_free();
|
||||||
|
}).join();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -16,39 +16,37 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
||||||
|
// static RNG initialization (revisit if n_threads stops being constant)
|
||||||
|
static const size_t n_threads = std::thread::hardware_concurrency();
|
||||||
|
static std::vector<std::default_random_engine> generators = []() {
|
||||||
|
std::random_device rd;
|
||||||
|
std::vector<std::default_random_engine> vec;
|
||||||
|
vec.reserve(n_threads);
|
||||||
|
//for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
|
||||||
|
for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
|
||||||
|
return vec;
|
||||||
|
}();
|
||||||
|
|
||||||
size_t size = ggml_nelements(tensor);
|
size_t size = ggml_nelements(tensor);
|
||||||
std::vector<float> data(size);
|
std::vector<float> data(size);
|
||||||
|
|
||||||
#if 0
|
auto init_thread = [&](size_t ith, size_t start, size_t end) {
|
||||||
static std::default_random_engine generator(1234);
|
|
||||||
std::uniform_real_distribution<float> distribution(min, max);
|
std::uniform_real_distribution<float> distribution(min, max);
|
||||||
|
|
||||||
for (size_t i = 0; i < size; i++) {
|
|
||||||
data[i] = distribution(generator);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
auto init_thread = [&](size_t start, size_t end) {
|
|
||||||
std::random_device rd;
|
|
||||||
std::default_random_engine generator(rd());
|
|
||||||
std::uniform_real_distribution<float> distribution(min, max);
|
|
||||||
|
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
data[i] = distribution(generator);
|
data[i] = distribution(generators[ith]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
size_t n_threads = std::thread::hardware_concurrency();
|
|
||||||
std::vector<std::thread> threads;
|
std::vector<std::thread> threads;
|
||||||
threads.reserve(n_threads);
|
threads.reserve(n_threads);
|
||||||
for (size_t i = 0; i < n_threads; i++) {
|
for (size_t i = 0; i < n_threads; i++) {
|
||||||
size_t start = i*size/n_threads;
|
size_t start = i*size/n_threads;
|
||||||
size_t end = (i+1)*size/n_threads;
|
size_t end = (i+1)*size/n_threads;
|
||||||
threads.emplace_back(init_thread, start, end);
|
threads.emplace_back(init_thread, i, start, end);
|
||||||
}
|
}
|
||||||
for (auto & t : threads) {
|
for (auto & t : threads) {
|
||||||
t.join();
|
t.join();
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
||||||
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
||||||
|
@ -56,7 +54,16 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||||
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
|
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
|
||||||
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
|
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
|
||||||
int64_t hist[16];
|
int64_t hist[16];
|
||||||
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
|
std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
|
||||||
|
const float * im = imatrix.data();
|
||||||
|
if (!ggml_quantize_requires_imatrix(tensor->type)) {
|
||||||
|
// when the imatrix is optional, we want to test both quantization with and without imatrix
|
||||||
|
// use one of the random numbers to decide
|
||||||
|
if (data[0] > 0.5f*(min + max)) {
|
||||||
|
im = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im);
|
||||||
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
||||||
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
||||||
// This is going to create some weird integers though.
|
// This is going to create some weird integers though.
|
||||||
|
@ -1472,7 +1479,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||||
GGML_TYPE_Q8_0,
|
GGML_TYPE_Q8_0,
|
||||||
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
||||||
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
||||||
GGML_TYPE_Q6_K
|
GGML_TYPE_Q6_K,
|
||||||
|
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
|
||||||
};
|
};
|
||||||
|
|
||||||
// unary ops
|
// unary ops
|
||||||
|
@ -1752,6 +1760,8 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_quantize_free();
|
||||||
|
|
||||||
printf("\033[1;32mOK\033[0m\n");
|
printf("\033[1;32mOK\033[0m\n");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|