Merge remote-tracking branch 'upstream/master'
This commit is contained in:
commit
1f32360659
32 changed files with 1580 additions and 1187 deletions
|
@ -225,6 +225,9 @@ effectiveStdenv.mkDerivation (
|
||||||
description = "contains numpy and sentencepiece";
|
description = "contains numpy and sentencepiece";
|
||||||
buildInputs = [ llama-python ];
|
buildInputs = [ llama-python ];
|
||||||
inputsFrom = [ finalAttrs.finalPackage ];
|
inputsFrom = [ finalAttrs.finalPackage ];
|
||||||
|
shellHook = ''
|
||||||
|
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
|
||||||
|
'';
|
||||||
};
|
};
|
||||||
|
|
||||||
shell-extra = mkShell {
|
shell-extra = mkShell {
|
||||||
|
|
12
.github/workflows/build.yml
vendored
12
.github/workflows/build.yml
vendored
|
@ -72,7 +72,7 @@ jobs:
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest --verbose --timeout 900
|
ctest -L main --verbose --timeout 900
|
||||||
|
|
||||||
ubuntu-latest-cmake-sanitizer:
|
ubuntu-latest-cmake-sanitizer:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
@ -107,7 +107,7 @@ jobs:
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest --verbose --timeout 900
|
ctest -L main --verbose --timeout 900
|
||||||
|
|
||||||
ubuntu-latest-cmake-mpi:
|
ubuntu-latest-cmake-mpi:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
@ -141,7 +141,7 @@ jobs:
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest --verbose
|
ctest -L main --verbose
|
||||||
|
|
||||||
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||||
# how to debug it.
|
# how to debug it.
|
||||||
|
@ -202,7 +202,7 @@ jobs:
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest --verbose --timeout 900
|
ctest -L main --verbose --timeout 900
|
||||||
|
|
||||||
macOS-latest-cmake-ios:
|
macOS-latest-cmake-ios:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
@ -394,7 +394,7 @@ jobs:
|
||||||
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
|
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest -C Release --verbose --timeout 900
|
ctest -L main -C Release --verbose --timeout 900
|
||||||
|
|
||||||
- name: Test (Intel SDE)
|
- name: Test (Intel SDE)
|
||||||
id: cmake_test_sde
|
id: cmake_test_sde
|
||||||
|
@ -406,7 +406,7 @@ jobs:
|
||||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||||
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||||
cd build
|
cd build
|
||||||
& $sde -future -- ctest -C Release --verbose --timeout 900
|
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||||
|
|
||||||
- name: Determine tag name
|
- name: Determine tag name
|
||||||
id: tag
|
id: tag
|
||||||
|
|
19
.gitignore
vendored
19
.gitignore
vendored
|
@ -27,7 +27,7 @@
|
||||||
lcov-report/
|
lcov-report/
|
||||||
gcovr-report/
|
gcovr-report/
|
||||||
|
|
||||||
build*/
|
build*
|
||||||
out/
|
out/
|
||||||
tmp/
|
tmp/
|
||||||
|
|
||||||
|
@ -89,20 +89,3 @@ examples/jeopardy/results.txt
|
||||||
|
|
||||||
poetry.lock
|
poetry.lock
|
||||||
poetry.toml
|
poetry.toml
|
||||||
|
|
||||||
# Test binaries
|
|
||||||
/tests/test-grammar-parser
|
|
||||||
/tests/test-llama-grammar
|
|
||||||
/tests/test-double-float
|
|
||||||
/tests/test-grad0
|
|
||||||
/tests/test-opt
|
|
||||||
/tests/test-quantize-fns
|
|
||||||
/tests/test-quantize-perf
|
|
||||||
/tests/test-sampling
|
|
||||||
/tests/test-tokenizer-0-llama
|
|
||||||
/tests/test-tokenizer-0-falcon
|
|
||||||
/tests/test-tokenizer-1-llama
|
|
||||||
/tests/test-tokenizer-1-bpe
|
|
||||||
/tests/test-rope
|
|
||||||
/tests/test-backend-ops
|
|
||||||
/tests/test-autorelease
|
|
||||||
|
|
|
@ -466,17 +466,17 @@ function(get_flags CCID CCVER)
|
||||||
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
|
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
|
||||||
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
|
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
|
||||||
)
|
)
|
||||||
set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
|
list(APPEND C_FLAGS -Wdouble-promotion)
|
||||||
endif()
|
endif()
|
||||||
elseif (CCID STREQUAL "GNU")
|
elseif (CCID STREQUAL "GNU")
|
||||||
set(C_FLAGS -Wdouble-promotion)
|
set(C_FLAGS -Wdouble-promotion)
|
||||||
set(CXX_FLAGS -Wno-array-bounds)
|
set(CXX_FLAGS -Wno-array-bounds)
|
||||||
|
|
||||||
if (CCVER VERSION_GREATER_EQUAL 7.1.0)
|
if (CCVER VERSION_GREATER_EQUAL 7.1.0)
|
||||||
set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
|
list(APPEND CXX_FLAGS -Wno-format-truncation)
|
||||||
endif()
|
endif()
|
||||||
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
|
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
|
||||||
set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
|
list(APPEND CXX_FLAGS -Wextra-semi)
|
||||||
endif()
|
endif()
|
||||||
elseif (CCID MATCHES "Intel")
|
elseif (CCID MATCHES "Intel")
|
||||||
# enable max optimization level when using Intel compiler
|
# enable max optimization level when using Intel compiler
|
||||||
|
@ -510,16 +510,18 @@ if (LLAMA_ALL_WARNINGS)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
set(CUDA_CXX_FLAGS "")
|
||||||
|
|
||||||
if (LLAMA_CUBLAS)
|
if (LLAMA_CUBLAS)
|
||||||
set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
|
set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
|
||||||
if (NOT MSVC)
|
if (NOT MSVC)
|
||||||
set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
|
list(APPEND CUDA_FLAGS -Wno-pedantic)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_ALL_WARNINGS AND NOT MSVC)
|
if (LLAMA_ALL_WARNINGS AND NOT MSVC)
|
||||||
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
|
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
|
||||||
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
|
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
|
||||||
set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
|
list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
execute_process(
|
execute_process(
|
||||||
|
@ -547,13 +549,8 @@ if (LLAMA_CUBLAS)
|
||||||
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
||||||
|
|
||||||
get_flags(${CUDA_CCID} ${CUDA_CCVER})
|
get_flags(${CUDA_CCID} ${CUDA_CCVER})
|
||||||
list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument
|
list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
|
||||||
if (NOT CUDA_CXX_FLAGS STREQUAL "")
|
|
||||||
set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
|
|
||||||
endif()
|
endif()
|
||||||
endif()
|
|
||||||
|
|
||||||
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
|
@ -618,12 +615,7 @@ if (NOT MSVC)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
function(add_compile_option_cpp ARG)
|
set(ARCH_FLAGS "")
|
||||||
# Adds a compile option to C/C++ only, but not for Cuda.
|
|
||||||
# Use, e.g., for CPU-architecture flags.
|
|
||||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>)
|
|
||||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>)
|
|
||||||
endfunction()
|
|
||||||
|
|
||||||
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
|
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
|
||||||
message(STATUS "ARM detected")
|
message(STATUS "ARM detected")
|
||||||
|
@ -636,19 +628,19 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
|
||||||
else()
|
else()
|
||||||
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
||||||
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
||||||
add_compile_options(-mfp16-format=ieee)
|
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
|
||||||
endif()
|
endif()
|
||||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
|
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
|
||||||
# Raspberry Pi 1, Zero
|
# Raspberry Pi 1, Zero
|
||||||
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
|
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||||
endif()
|
endif()
|
||||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
||||||
# Raspberry Pi 2
|
# Raspberry Pi 2
|
||||||
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||||
endif()
|
endif()
|
||||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
||||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||||
add_compile_options(-mno-unaligned-access)
|
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
|
||||||
|
@ -659,7 +651,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
|
||||||
include(cmake/FindSIMD.cmake)
|
include(cmake/FindSIMD.cmake)
|
||||||
endif ()
|
endif ()
|
||||||
if (LLAMA_AVX512)
|
if (LLAMA_AVX512)
|
||||||
add_compile_option_cpp(/arch:AVX512)
|
list(APPEND ARCH_FLAGS /arch:AVX512)
|
||||||
# MSVC has no compile-time flags enabling specific
|
# MSVC has no compile-time flags enabling specific
|
||||||
# AVX512 extensions, neither it defines the
|
# AVX512 extensions, neither it defines the
|
||||||
# macros corresponding to the extensions.
|
# macros corresponding to the extensions.
|
||||||
|
@ -673,49 +665,61 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
|
||||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
|
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
|
||||||
endif()
|
endif()
|
||||||
elseif (LLAMA_AVX2)
|
elseif (LLAMA_AVX2)
|
||||||
add_compile_option_cpp(/arch:AVX2)
|
list(APPEND ARCH_FLAGS /arch:AVX2)
|
||||||
elseif (LLAMA_AVX)
|
elseif (LLAMA_AVX)
|
||||||
add_compile_option_cpp(/arch:AVX)
|
list(APPEND ARCH_FLAGS /arch:AVX)
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
if (LLAMA_NATIVE)
|
if (LLAMA_NATIVE)
|
||||||
add_compile_option_cpp(-march=native)
|
list(APPEND ARCH_FLAGS -march=native)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_F16C)
|
if (LLAMA_F16C)
|
||||||
add_compile_option_cpp(-mf16c)
|
list(APPEND ARCH_FLAGS -mf16c)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_FMA)
|
if (LLAMA_FMA)
|
||||||
add_compile_option_cpp(-mfma)
|
list(APPEND ARCH_FLAGS -mfma)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_AVX)
|
if (LLAMA_AVX)
|
||||||
add_compile_option_cpp(-mavx)
|
list(APPEND ARCH_FLAGS -mavx)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_AVX2)
|
if (LLAMA_AVX2)
|
||||||
add_compile_option_cpp(-mavx2)
|
list(APPEND ARCH_FLAGS -mavx2)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_AVX512)
|
if (LLAMA_AVX512)
|
||||||
add_compile_option_cpp(-mavx512f)
|
list(APPEND ARCH_FLAGS -mavx512f)
|
||||||
add_compile_option_cpp(-mavx512bw)
|
list(APPEND ARCH_FLAGS -mavx512bw)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_AVX512_VBMI)
|
if (LLAMA_AVX512_VBMI)
|
||||||
add_compile_option_cpp(-mavx512vbmi)
|
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_AVX512_VNNI)
|
if (LLAMA_AVX512_VNNI)
|
||||||
add_compile_option_cpp(-mavx512vnni)
|
list(APPEND ARCH_FLAGS -mavx512vnni)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||||
message(STATUS "PowerPC detected")
|
message(STATUS "PowerPC detected")
|
||||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||||
add_compile_options(-mcpu=powerpc64le)
|
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
||||||
else()
|
else()
|
||||||
add_compile_options(-mcpu=native -mtune=native)
|
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
||||||
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
message(STATUS "Unknown architecture")
|
message(STATUS "Unknown architecture")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
|
||||||
|
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
|
||||||
|
|
||||||
|
if (LLAMA_CUBLAS)
|
||||||
|
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
|
||||||
|
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
|
||||||
|
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
|
||||||
|
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
|
||||||
|
endif()
|
||||||
|
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
|
||||||
|
endif()
|
||||||
|
|
||||||
if (MINGW)
|
if (MINGW)
|
||||||
# Target Windows 8 for PrefetchVirtualMemory
|
# Target Windows 8 for PrefetchVirtualMemory
|
||||||
add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
|
add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
|
||||||
|
|
9
Makefile
9
Makefile
|
@ -9,7 +9,7 @@ TEST_TARGETS = \
|
||||||
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
||||||
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
||||||
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
|
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
|
||||||
tests/test-backend-ops tests/test-autorelease
|
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
|
||||||
|
|
||||||
# Code coverage output files
|
# Code coverage output files
|
||||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||||
|
@ -619,7 +619,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
|
||||||
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
|
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
|
||||||
|
|
||||||
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
||||||
|
@ -748,5 +748,8 @@ tests/test-c.o: tests/test-c.c llama.h
|
||||||
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
|
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
|
@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||||
|
|
||||||
### Hot topics
|
### Hot topics
|
||||||
|
|
||||||
|
- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
|
||||||
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
|
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
|
||||||
- Collecting Apple Silicon performance stats:
|
- Collecting Apple Silicon performance stats:
|
||||||
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
|
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||||
- A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
|
- A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
|
||||||
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
|
|
||||||
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
|
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
|
||||||
|
|
||||||
----
|
----
|
||||||
|
@ -112,6 +112,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||||
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
|
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
|
||||||
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
|
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
|
||||||
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
|
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
|
||||||
|
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
|
||||||
|
|
||||||
|
|
||||||
**Bindings:**
|
**Bindings:**
|
||||||
|
|
81
ci/run.sh
81
ci/run.sh
|
@ -22,9 +22,9 @@ mkdir -p "$2"
|
||||||
OUT=$(realpath "$1")
|
OUT=$(realpath "$1")
|
||||||
MNT=$(realpath "$2")
|
MNT=$(realpath "$2")
|
||||||
|
|
||||||
rm -v $OUT/*.log
|
rm -f "$OUT/*.log"
|
||||||
rm -v $OUT/*.exit
|
rm -f "$OUT/*.exit"
|
||||||
rm -v $OUT/*.md
|
rm -f "$OUT/*.md"
|
||||||
|
|
||||||
sd=`dirname $0`
|
sd=`dirname $0`
|
||||||
cd $sd/../
|
cd $sd/../
|
||||||
|
@ -94,7 +94,7 @@ function gg_run_ctest_debug {
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
}
|
}
|
||||||
|
@ -123,9 +123,9 @@ function gg_run_ctest_release {
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
else
|
else
|
||||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
|
@ -141,6 +141,61 @@ function gg_sum_ctest_release {
|
||||||
gg_printf '```\n'
|
gg_printf '```\n'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function gg_get_model {
|
||||||
|
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
|
||||||
|
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||||
|
if [[ -s $gguf_3b ]]; then
|
||||||
|
echo -n "$gguf_3b"
|
||||||
|
elif [[ -s $gguf_7b ]]; then
|
||||||
|
echo -n "$gguf_7b"
|
||||||
|
else
|
||||||
|
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_run_ctest_with_model_debug {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
local model; model=$(gg_get_model)
|
||||||
|
cd build-ci-debug
|
||||||
|
set -e
|
||||||
|
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
|
set +e
|
||||||
|
cd ..
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_run_ctest_with_model_release {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
local model; model=$(gg_get_model)
|
||||||
|
cd build-ci-release
|
||||||
|
set -e
|
||||||
|
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
|
set +e
|
||||||
|
cd ..
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_ctest_with_model_debug {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Runs ctest with model files in debug mode\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '```\n'
|
||||||
|
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||||
|
gg_printf '```\n'
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_ctest_with_model_release {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Runs ctest with model files in release mode\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '```\n'
|
||||||
|
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||||
|
gg_printf '```\n'
|
||||||
|
}
|
||||||
|
|
||||||
# open_llama_3b_v2
|
# open_llama_3b_v2
|
||||||
|
|
||||||
function gg_run_open_llama_3b_v2 {
|
function gg_run_open_llama_3b_v2 {
|
||||||
|
@ -183,8 +238,6 @@ function gg_run_open_llama_3b_v2 {
|
||||||
|
|
||||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||||
|
|
||||||
./bin/test-autorelease ${model_f16}
|
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
@ -507,14 +560,18 @@ function gg_sum_open_llama_7b_v2 {
|
||||||
## main
|
## main
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
|
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
|
||||||
rm -rf ${SRC}/models-mnt
|
rm -rf ${SRC}/models-mnt
|
||||||
|
|
||||||
mnt_models=${MNT}/models
|
mnt_models=${MNT}/models
|
||||||
mkdir -p ${mnt_models}
|
mkdir -p ${mnt_models}
|
||||||
ln -sfn ${mnt_models} ${SRC}/models-mnt
|
ln -sfn ${mnt_models} ${SRC}/models-mnt
|
||||||
|
|
||||||
python3 -m pip install -r ${SRC}/requirements.txt
|
# Create a fresh python3 venv and enter it
|
||||||
python3 -m pip install --editable gguf-py
|
python3 -m venv "$MNT/venv"
|
||||||
|
source "$MNT/venv/bin/activate"
|
||||||
|
|
||||||
|
pip install -r ${SRC}/requirements.txt --disable-pip-version-check
|
||||||
|
pip install --editable gguf-py --disable-pip-version-check
|
||||||
fi
|
fi
|
||||||
|
|
||||||
ret=0
|
ret=0
|
||||||
|
@ -529,6 +586,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
else
|
else
|
||||||
test $ret -eq 0 && gg_run open_llama_7b_v2
|
test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||||
fi
|
fi
|
||||||
|
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||||
|
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
|
@ -129,6 +129,8 @@ static void sampler_queue(
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||||
|
|
||||||
const float temp = params.temp;
|
const float temp = params.temp;
|
||||||
|
const float dynatemp_range = params.dynatemp_range;
|
||||||
|
const float dynatemp_exponent = params.dynatemp_exponent;
|
||||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||||
const float top_p = params.top_p;
|
const float top_p = params.top_p;
|
||||||
const float min_p = params.min_p;
|
const float min_p = params.min_p;
|
||||||
|
@ -143,7 +145,15 @@ static void sampler_queue(
|
||||||
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||||
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||||
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||||
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
|
case 't':
|
||||||
|
if (dynatemp_range > 0) {
|
||||||
|
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||||
|
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
||||||
|
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
|
||||||
|
} else {
|
||||||
|
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||||
|
}
|
||||||
|
break;
|
||||||
default : break;
|
default : break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,8 @@ typedef struct llama_sampling_params {
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
float typical_p = 1.00f; // 1.0 = disabled
|
float typical_p = 1.00f; // 1.0 = disabled
|
||||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||||
|
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||||
|
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||||
|
|
|
@ -30,6 +30,7 @@ android {
|
||||||
}
|
}
|
||||||
externalNativeBuild {
|
externalNativeBuild {
|
||||||
cmake {
|
cmake {
|
||||||
|
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||||
cppFlags += listOf()
|
cppFlags += listOf()
|
||||||
arguments += listOf()
|
arguments += listOf()
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
# Function calling example using pydantic models.
|
# Function calling example using pydantic models.
|
||||||
import datetime
|
import datetime
|
||||||
|
import importlib
|
||||||
import json
|
import json
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Union, Optional
|
from typing import Optional, Union
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
|
||||||
import importlib
|
create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
|
||||||
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
|
|
||||||
|
|
||||||
|
|
||||||
# Function to get completion on the llama.cpp server with grammar.
|
# Function to get completion on the llama.cpp server with grammar.
|
||||||
|
@ -35,7 +35,7 @@ class SendMessageToUser(BaseModel):
|
||||||
print(self.message)
|
print(self.message)
|
||||||
|
|
||||||
|
|
||||||
# Enum for the calculator function.
|
# Enum for the calculator tool.
|
||||||
class MathOperation(Enum):
|
class MathOperation(Enum):
|
||||||
ADD = "add"
|
ADD = "add"
|
||||||
SUBTRACT = "subtract"
|
SUBTRACT = "subtract"
|
||||||
|
@ -43,7 +43,7 @@ class MathOperation(Enum):
|
||||||
DIVIDE = "divide"
|
DIVIDE = "divide"
|
||||||
|
|
||||||
|
|
||||||
# Very simple calculator tool for the agent.
|
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
|
||||||
class Calculator(BaseModel):
|
class Calculator(BaseModel):
|
||||||
"""
|
"""
|
||||||
Perform a math operation on two numbers.
|
Perform a math operation on two numbers.
|
||||||
|
@ -148,37 +148,6 @@ def get_current_datetime(output_format: Optional[str] = None):
|
||||||
return datetime.datetime.now().strftime(output_format)
|
return datetime.datetime.now().strftime(output_format)
|
||||||
|
|
||||||
|
|
||||||
# Enum for the calculator tool.
|
|
||||||
class MathOperation(Enum):
|
|
||||||
ADD = "add"
|
|
||||||
SUBTRACT = "subtract"
|
|
||||||
MULTIPLY = "multiply"
|
|
||||||
DIVIDE = "divide"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
|
|
||||||
class Calculator(BaseModel):
|
|
||||||
"""
|
|
||||||
Perform a math operation on two numbers.
|
|
||||||
"""
|
|
||||||
number_one: Union[int, float] = Field(..., description="First number.")
|
|
||||||
operation: MathOperation = Field(..., description="Math operation to perform.")
|
|
||||||
number_two: Union[int, float] = Field(..., description="Second number.")
|
|
||||||
|
|
||||||
def run(self):
|
|
||||||
if self.operation == MathOperation.ADD:
|
|
||||||
return self.number_one + self.number_two
|
|
||||||
elif self.operation == MathOperation.SUBTRACT:
|
|
||||||
return self.number_one - self.number_two
|
|
||||||
elif self.operation == MathOperation.MULTIPLY:
|
|
||||||
return self.number_one * self.number_two
|
|
||||||
elif self.operation == MathOperation.DIVIDE:
|
|
||||||
return self.number_one / self.number_two
|
|
||||||
else:
|
|
||||||
raise ValueError("Unknown operation.")
|
|
||||||
|
|
||||||
|
|
||||||
# Example function to get the weather
|
# Example function to get the weather
|
||||||
def get_current_weather(location, unit):
|
def get_current_weather(location, unit):
|
||||||
"""Get the current weather in a given location"""
|
"""Get the current weather in a given location"""
|
||||||
|
|
|
@ -1,15 +1,21 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import inspect
|
import inspect
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
from copy import copy
|
from copy import copy
|
||||||
from inspect import isclass, getdoc
|
from enum import Enum
|
||||||
from types import NoneType
|
from inspect import getdoc, isclass
|
||||||
|
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
|
||||||
|
|
||||||
from docstring_parser import parse
|
from docstring_parser import parse
|
||||||
from pydantic import BaseModel, create_model, Field
|
from pydantic import BaseModel, Field, create_model
|
||||||
from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
|
|
||||||
from enum import Enum
|
if TYPE_CHECKING:
|
||||||
from typing import get_type_hints, Callable
|
from types import GenericAlias
|
||||||
import re
|
else:
|
||||||
|
# python 3.8 compat
|
||||||
|
from typing import _GenericAlias as GenericAlias
|
||||||
|
|
||||||
|
|
||||||
class PydanticDataType(Enum):
|
class PydanticDataType(Enum):
|
||||||
|
@ -43,7 +49,7 @@ class PydanticDataType(Enum):
|
||||||
SET = "set"
|
SET = "set"
|
||||||
|
|
||||||
|
|
||||||
def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
|
def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
|
||||||
if isclass(pydantic_type) and issubclass(pydantic_type, str):
|
if isclass(pydantic_type) and issubclass(pydantic_type, str):
|
||||||
return PydanticDataType.STRING.value
|
return PydanticDataType.STRING.value
|
||||||
elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
|
elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
|
||||||
|
@ -57,22 +63,22 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
|
||||||
|
|
||||||
elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
|
elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
|
||||||
return format_model_and_field_name(pydantic_type.__name__)
|
return format_model_and_field_name(pydantic_type.__name__)
|
||||||
elif get_origin(pydantic_type) == list:
|
elif get_origin(pydantic_type) is list:
|
||||||
element_type = get_args(pydantic_type)[0]
|
element_type = get_args(pydantic_type)[0]
|
||||||
return f"{map_pydantic_type_to_gbnf(element_type)}-list"
|
return f"{map_pydantic_type_to_gbnf(element_type)}-list"
|
||||||
elif get_origin(pydantic_type) == set:
|
elif get_origin(pydantic_type) is set:
|
||||||
element_type = get_args(pydantic_type)[0]
|
element_type = get_args(pydantic_type)[0]
|
||||||
return f"{map_pydantic_type_to_gbnf(element_type)}-set"
|
return f"{map_pydantic_type_to_gbnf(element_type)}-set"
|
||||||
elif get_origin(pydantic_type) == Union:
|
elif get_origin(pydantic_type) is Union:
|
||||||
union_types = get_args(pydantic_type)
|
union_types = get_args(pydantic_type)
|
||||||
union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
|
union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
|
||||||
return f"union-{'-or-'.join(union_rules)}"
|
return f"union-{'-or-'.join(union_rules)}"
|
||||||
elif get_origin(pydantic_type) == Optional:
|
elif get_origin(pydantic_type) is Optional:
|
||||||
element_type = get_args(pydantic_type)[0]
|
element_type = get_args(pydantic_type)[0]
|
||||||
return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
|
return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
|
||||||
elif isclass(pydantic_type):
|
elif isclass(pydantic_type):
|
||||||
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
|
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
|
||||||
elif get_origin(pydantic_type) == dict:
|
elif get_origin(pydantic_type) is dict:
|
||||||
key_type, value_type = get_args(pydantic_type)
|
key_type, value_type = get_args(pydantic_type)
|
||||||
return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
|
return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
|
||||||
else:
|
else:
|
||||||
|
@ -106,7 +112,6 @@ def get_members_structure(cls, rule_name):
|
||||||
return f"{cls.__name__.lower()} ::= " + " | ".join(members)
|
return f"{cls.__name__.lower()} ::= " + " | ".join(members)
|
||||||
if cls.__annotations__ and cls.__annotations__ != {}:
|
if cls.__annotations__ and cls.__annotations__ != {}:
|
||||||
result = f'{rule_name} ::= "{{"'
|
result = f'{rule_name} ::= "{{"'
|
||||||
type_list_rules = []
|
|
||||||
# Modify this comprehension
|
# Modify this comprehension
|
||||||
members = [
|
members = [
|
||||||
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}'
|
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}'
|
||||||
|
@ -116,17 +121,15 @@ def get_members_structure(cls, rule_name):
|
||||||
|
|
||||||
result += '"," '.join(members)
|
result += '"," '.join(members)
|
||||||
result += ' "}"'
|
result += ' "}"'
|
||||||
return result, type_list_rules
|
return result
|
||||||
elif rule_name == "custom-class-any":
|
if rule_name == "custom-class-any":
|
||||||
result = f"{rule_name} ::= "
|
result = f"{rule_name} ::= "
|
||||||
result += "value"
|
result += "value"
|
||||||
type_list_rules = []
|
return result
|
||||||
return result, type_list_rules
|
|
||||||
else:
|
|
||||||
init_signature = inspect.signature(cls.__init__)
|
init_signature = inspect.signature(cls.__init__)
|
||||||
parameters = init_signature.parameters
|
parameters = init_signature.parameters
|
||||||
result = f'{rule_name} ::= "{{"'
|
result = f'{rule_name} ::= "{{"'
|
||||||
type_list_rules = []
|
|
||||||
# Modify this comprehension too
|
# Modify this comprehension too
|
||||||
members = [
|
members = [
|
||||||
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}'
|
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}'
|
||||||
|
@ -136,7 +139,7 @@ def get_members_structure(cls, rule_name):
|
||||||
|
|
||||||
result += '", "'.join(members)
|
result += '", "'.join(members)
|
||||||
result += ' "}"'
|
result += ' "}"'
|
||||||
return result, type_list_rules
|
return result
|
||||||
|
|
||||||
|
|
||||||
def regex_to_gbnf(regex_pattern: str) -> str:
|
def regex_to_gbnf(regex_pattern: str) -> str:
|
||||||
|
@ -269,7 +272,7 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
|
||||||
|
|
||||||
def generate_gbnf_rule_for_type(
|
def generate_gbnf_rule_for_type(
|
||||||
model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
|
model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
|
||||||
) -> Tuple[str, list]:
|
) -> tuple[str, list[str]]:
|
||||||
"""
|
"""
|
||||||
Generate GBNF rule for a given field type.
|
Generate GBNF rule for a given field type.
|
||||||
|
|
||||||
|
@ -283,7 +286,7 @@ def generate_gbnf_rule_for_type(
|
||||||
:param field_info: Additional information about the field (optional).
|
:param field_info: Additional information about the field (optional).
|
||||||
|
|
||||||
:return: Tuple containing the GBNF type and a list of additional rules.
|
:return: Tuple containing the GBNF type and a list of additional rules.
|
||||||
:rtype: Tuple[str, list]
|
:rtype: tuple[str, list]
|
||||||
"""
|
"""
|
||||||
rules = []
|
rules = []
|
||||||
|
|
||||||
|
@ -321,8 +324,7 @@ def generate_gbnf_rule_for_type(
|
||||||
gbnf_type, rules = model_name + "-" + field_name, rules
|
gbnf_type, rules = model_name + "-" + field_name, rules
|
||||||
|
|
||||||
elif gbnf_type.startswith("custom-class-"):
|
elif gbnf_type.startswith("custom-class-"):
|
||||||
nested_model_rules, field_types = get_members_structure(field_type, gbnf_type)
|
rules.append(get_members_structure(field_type, gbnf_type))
|
||||||
rules.append(nested_model_rules)
|
|
||||||
elif gbnf_type.startswith("custom-dict-"):
|
elif gbnf_type.startswith("custom-dict-"):
|
||||||
key_type, value_type = get_args(field_type)
|
key_type, value_type = get_args(field_type)
|
||||||
|
|
||||||
|
@ -341,14 +343,14 @@ def generate_gbnf_rule_for_type(
|
||||||
union_rules = []
|
union_rules = []
|
||||||
|
|
||||||
for union_type in union_types:
|
for union_type in union_types:
|
||||||
if isinstance(union_type, _GenericAlias):
|
if isinstance(union_type, GenericAlias):
|
||||||
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
|
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
|
||||||
model_name, field_name, union_type, False, processed_models, created_rules
|
model_name, field_name, union_type, False, processed_models, created_rules
|
||||||
)
|
)
|
||||||
union_rules.append(union_gbnf_type)
|
union_rules.append(union_gbnf_type)
|
||||||
rules.extend(union_rules_list)
|
rules.extend(union_rules_list)
|
||||||
|
|
||||||
elif not issubclass(union_type, NoneType):
|
elif not issubclass(union_type, type(None)):
|
||||||
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
|
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
|
||||||
model_name, field_name, union_type, False, processed_models, created_rules
|
model_name, field_name, union_type, False, processed_models, created_rules
|
||||||
)
|
)
|
||||||
|
@ -424,14 +426,10 @@ def generate_gbnf_rule_for_type(
|
||||||
else:
|
else:
|
||||||
gbnf_type, rules = gbnf_type, []
|
gbnf_type, rules = gbnf_type, []
|
||||||
|
|
||||||
if gbnf_type not in created_rules:
|
|
||||||
return gbnf_type, rules
|
|
||||||
else:
|
|
||||||
if gbnf_type in created_rules:
|
|
||||||
return gbnf_type, rules
|
return gbnf_type, rules
|
||||||
|
|
||||||
|
|
||||||
def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool):
|
def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[BaseModel]], created_rules: dict[str, list[str]]) -> tuple[list[str], bool]:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
Generate GBnF Grammar
|
Generate GBnF Grammar
|
||||||
|
@ -452,7 +450,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
if model in processed_models:
|
if model in processed_models:
|
||||||
return []
|
return [], False
|
||||||
|
|
||||||
processed_models.add(model)
|
processed_models.add(model)
|
||||||
model_name = format_model_and_field_name(model.__name__)
|
model_name = format_model_and_field_name(model.__name__)
|
||||||
|
@ -518,7 +516,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
|
||||||
|
|
||||||
|
|
||||||
def generate_gbnf_grammar_from_pydantic_models(
|
def generate_gbnf_grammar_from_pydantic_models(
|
||||||
models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None,
|
models: list[type[BaseModel]], outer_object_name: str | None = None, outer_object_content: str | None = None,
|
||||||
list_of_outputs: bool = False
|
list_of_outputs: bool = False
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
|
@ -528,7 +526,7 @@ def generate_gbnf_grammar_from_pydantic_models(
|
||||||
* grammar.
|
* grammar.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
|
models (list[type[BaseModel]]): A list of Pydantic models to generate the grammar from.
|
||||||
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
|
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
|
||||||
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
|
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
|
||||||
list_of_outputs (str, optional): Allows a list of output objects
|
list_of_outputs (str, optional): Allows a list of output objects
|
||||||
|
@ -543,9 +541,9 @@ def generate_gbnf_grammar_from_pydantic_models(
|
||||||
# root ::= UserModel | PostModel
|
# root ::= UserModel | PostModel
|
||||||
# ...
|
# ...
|
||||||
"""
|
"""
|
||||||
processed_models = set()
|
processed_models: set[type[BaseModel]] = set()
|
||||||
all_rules = []
|
all_rules = []
|
||||||
created_rules = {}
|
created_rules: dict[str, list[str]] = {}
|
||||||
if outer_object_name is None:
|
if outer_object_name is None:
|
||||||
for model in models:
|
for model in models:
|
||||||
model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
|
model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
|
||||||
|
@ -608,7 +606,7 @@ def get_primitive_grammar(grammar):
|
||||||
Returns:
|
Returns:
|
||||||
str: GBNF primitive grammar string.
|
str: GBNF primitive grammar string.
|
||||||
"""
|
"""
|
||||||
type_list = []
|
type_list: list[type[object]] = []
|
||||||
if "string-list" in grammar:
|
if "string-list" in grammar:
|
||||||
type_list.append(str)
|
type_list.append(str)
|
||||||
if "boolean-list" in grammar:
|
if "boolean-list" in grammar:
|
||||||
|
@ -666,14 +664,14 @@ triple-quotes ::= "'''" """
|
||||||
|
|
||||||
|
|
||||||
def generate_markdown_documentation(
|
def generate_markdown_documentation(
|
||||||
pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
||||||
documentation_with_field_description=True
|
documentation_with_field_description=True
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Generate markdown documentation for a list of Pydantic models.
|
Generate markdown documentation for a list of Pydantic models.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
|
pydantic_models (list[type[BaseModel]]): list of Pydantic model classes.
|
||||||
model_prefix (str): Prefix for the model section.
|
model_prefix (str): Prefix for the model section.
|
||||||
fields_prefix (str): Prefix for the fields section.
|
fields_prefix (str): Prefix for the fields section.
|
||||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||||
|
@ -731,7 +729,7 @@ def generate_markdown_documentation(
|
||||||
|
|
||||||
|
|
||||||
def generate_field_markdown(
|
def generate_field_markdown(
|
||||||
field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
|
field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
|
||||||
documentation_with_field_description=True
|
documentation_with_field_description=True
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
|
@ -739,8 +737,8 @@ def generate_field_markdown(
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
field_name (str): Name of the field.
|
field_name (str): Name of the field.
|
||||||
field_type (Type[Any]): Type of the field.
|
field_type (type[Any]): Type of the field.
|
||||||
model (Type[BaseModel]): Pydantic model class.
|
model (type[BaseModel]): Pydantic model class.
|
||||||
depth (int): Indentation depth in the documentation.
|
depth (int): Indentation depth in the documentation.
|
||||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||||
|
|
||||||
|
@ -798,7 +796,7 @@ def generate_field_markdown(
|
||||||
return field_text
|
return field_text
|
||||||
|
|
||||||
|
|
||||||
def format_json_example(example: dict, depth: int) -> str:
|
def format_json_example(example: dict[str, Any], depth: int) -> str:
|
||||||
"""
|
"""
|
||||||
Format a JSON example into a readable string with indentation.
|
Format a JSON example into a readable string with indentation.
|
||||||
|
|
||||||
|
@ -819,14 +817,14 @@ def format_json_example(example: dict, depth: int) -> str:
|
||||||
|
|
||||||
|
|
||||||
def generate_text_documentation(
|
def generate_text_documentation(
|
||||||
pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
||||||
documentation_with_field_description=True
|
documentation_with_field_description=True
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Generate text documentation for a list of Pydantic models.
|
Generate text documentation for a list of Pydantic models.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
|
pydantic_models (list[type[BaseModel]]): List of Pydantic model classes.
|
||||||
model_prefix (str): Prefix for the model section.
|
model_prefix (str): Prefix for the model section.
|
||||||
fields_prefix (str): Prefix for the fields section.
|
fields_prefix (str): Prefix for the fields section.
|
||||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||||
|
@ -885,7 +883,7 @@ def generate_text_documentation(
|
||||||
|
|
||||||
|
|
||||||
def generate_field_text(
|
def generate_field_text(
|
||||||
field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
|
field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
|
||||||
documentation_with_field_description=True
|
documentation_with_field_description=True
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
|
@ -893,8 +891,8 @@ def generate_field_text(
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
field_name (str): Name of the field.
|
field_name (str): Name of the field.
|
||||||
field_type (Type[Any]): Type of the field.
|
field_type (type[Any]): Type of the field.
|
||||||
model (Type[BaseModel]): Pydantic model class.
|
model (type[BaseModel]): Pydantic model class.
|
||||||
depth (int): Indentation depth in the documentation.
|
depth (int): Indentation depth in the documentation.
|
||||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||||
|
|
||||||
|
@ -1017,8 +1015,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
|
||||||
pydantic_model_list,
|
pydantic_model_list,
|
||||||
grammar_file_path="./generated_grammar.gbnf",
|
grammar_file_path="./generated_grammar.gbnf",
|
||||||
documentation_file_path="./generated_grammar_documentation.md",
|
documentation_file_path="./generated_grammar_documentation.md",
|
||||||
outer_object_name: str = None,
|
outer_object_name: str | None = None,
|
||||||
outer_object_content: str = None,
|
outer_object_content: str | None = None,
|
||||||
model_prefix: str = "Output Model",
|
model_prefix: str = "Output Model",
|
||||||
fields_prefix: str = "Output Fields",
|
fields_prefix: str = "Output Fields",
|
||||||
list_of_outputs: bool = False,
|
list_of_outputs: bool = False,
|
||||||
|
@ -1053,8 +1051,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
|
||||||
|
|
||||||
def generate_gbnf_grammar_and_documentation(
|
def generate_gbnf_grammar_and_documentation(
|
||||||
pydantic_model_list,
|
pydantic_model_list,
|
||||||
outer_object_name: str = None,
|
outer_object_name: str | None = None,
|
||||||
outer_object_content: str = None,
|
outer_object_content: str | None = None,
|
||||||
model_prefix: str = "Output Model",
|
model_prefix: str = "Output Model",
|
||||||
fields_prefix: str = "Output Fields",
|
fields_prefix: str = "Output Fields",
|
||||||
list_of_outputs: bool = False,
|
list_of_outputs: bool = False,
|
||||||
|
@ -1086,9 +1084,9 @@ def generate_gbnf_grammar_and_documentation(
|
||||||
|
|
||||||
|
|
||||||
def generate_gbnf_grammar_and_documentation_from_dictionaries(
|
def generate_gbnf_grammar_and_documentation_from_dictionaries(
|
||||||
dictionaries: List[dict],
|
dictionaries: list[dict[str, Any]],
|
||||||
outer_object_name: str = None,
|
outer_object_name: str | None = None,
|
||||||
outer_object_content: str = None,
|
outer_object_content: str | None = None,
|
||||||
model_prefix: str = "Output Model",
|
model_prefix: str = "Output Model",
|
||||||
fields_prefix: str = "Output Fields",
|
fields_prefix: str = "Output Fields",
|
||||||
list_of_outputs: bool = False,
|
list_of_outputs: bool = False,
|
||||||
|
@ -1098,7 +1096,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
|
||||||
Generate GBNF grammar and documentation from a list of dictionaries.
|
Generate GBNF grammar and documentation from a list of dictionaries.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dictionaries (List[dict]): List of dictionaries representing Pydantic models.
|
dictionaries (list[dict]): List of dictionaries representing Pydantic models.
|
||||||
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
|
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
|
||||||
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
|
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
|
||||||
model_prefix (str): Prefix for the model section in the documentation.
|
model_prefix (str): Prefix for the model section in the documentation.
|
||||||
|
@ -1120,7 +1118,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
|
||||||
return grammar, documentation
|
return grammar, documentation
|
||||||
|
|
||||||
|
|
||||||
def create_dynamic_model_from_function(func: Callable):
|
def create_dynamic_model_from_function(func: Callable[..., Any]):
|
||||||
"""
|
"""
|
||||||
Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.
|
Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.
|
||||||
|
|
||||||
|
@ -1135,6 +1133,7 @@ def create_dynamic_model_from_function(func: Callable):
|
||||||
sig = inspect.signature(func)
|
sig = inspect.signature(func)
|
||||||
|
|
||||||
# Parse the docstring
|
# Parse the docstring
|
||||||
|
assert func.__doc__ is not None
|
||||||
docstring = parse(func.__doc__)
|
docstring = parse(func.__doc__)
|
||||||
|
|
||||||
dynamic_fields = {}
|
dynamic_fields = {}
|
||||||
|
@ -1157,7 +1156,6 @@ def create_dynamic_model_from_function(func: Callable):
|
||||||
f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
|
f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
|
||||||
|
|
||||||
# Add parameter details to the schema
|
# Add parameter details to the schema
|
||||||
param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
|
|
||||||
param_docs.append((param.name, param_doc))
|
param_docs.append((param.name, param_doc))
|
||||||
if param.default == inspect.Parameter.empty:
|
if param.default == inspect.Parameter.empty:
|
||||||
default_value = ...
|
default_value = ...
|
||||||
|
@ -1166,10 +1164,10 @@ def create_dynamic_model_from_function(func: Callable):
|
||||||
dynamic_fields[param.name] = (
|
dynamic_fields[param.name] = (
|
||||||
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
|
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
|
||||||
# Creating the dynamic model
|
# Creating the dynamic model
|
||||||
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
|
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload]
|
||||||
|
|
||||||
for param_doc in param_docs:
|
for name, param_doc in param_docs:
|
||||||
dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description
|
dynamic_model.model_fields[name].description = param_doc.description
|
||||||
|
|
||||||
dynamic_model.__doc__ = docstring.short_description
|
dynamic_model.__doc__ = docstring.short_description
|
||||||
|
|
||||||
|
@ -1182,16 +1180,16 @@ def create_dynamic_model_from_function(func: Callable):
|
||||||
return dynamic_model
|
return dynamic_model
|
||||||
|
|
||||||
|
|
||||||
def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
|
def add_run_method_to_dynamic_model(model: type[BaseModel], func: Callable[..., Any]):
|
||||||
"""
|
"""
|
||||||
Add a 'run' method to a dynamic Pydantic model, using the provided function.
|
Add a 'run' method to a dynamic Pydantic model, using the provided function.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
model (Type[BaseModel]): Dynamic Pydantic model class.
|
model (type[BaseModel]): Dynamic Pydantic model class.
|
||||||
func (Callable): Function to be added as a 'run' method to the model.
|
func (Callable): Function to be added as a 'run' method to the model.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Type[BaseModel]: Pydantic model class with the added 'run' method.
|
type[BaseModel]: Pydantic model class with the added 'run' method.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def run_method_wrapper(self):
|
def run_method_wrapper(self):
|
||||||
|
@ -1204,15 +1202,15 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def create_dynamic_models_from_dictionaries(dictionaries: List[dict]):
|
def create_dynamic_models_from_dictionaries(dictionaries: list[dict[str, Any]]):
|
||||||
"""
|
"""
|
||||||
Create a list of dynamic Pydantic model classes from a list of dictionaries.
|
Create a list of dynamic Pydantic model classes from a list of dictionaries.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dictionaries (List[dict]): List of dictionaries representing model structures.
|
dictionaries (list[dict]): List of dictionaries representing model structures.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
|
list[type[BaseModel]]: List of generated dynamic Pydantic model classes.
|
||||||
"""
|
"""
|
||||||
dynamic_models = []
|
dynamic_models = []
|
||||||
for func in dictionaries:
|
for func in dictionaries:
|
||||||
|
@ -1249,7 +1247,7 @@ def list_to_enum(enum_name, values):
|
||||||
return Enum(enum_name, {value: value for value in values})
|
return Enum(enum_name, {value: value for value in values})
|
||||||
|
|
||||||
|
|
||||||
def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]:
|
def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: str = "CustomModel") -> type[Any]:
|
||||||
"""
|
"""
|
||||||
Convert a dictionary to a Pydantic model class.
|
Convert a dictionary to a Pydantic model class.
|
||||||
|
|
||||||
|
@ -1258,9 +1256,9 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
|
||||||
model_name (str): Name of the generated Pydantic model.
|
model_name (str): Name of the generated Pydantic model.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Type[BaseModel]: Generated Pydantic model class.
|
type[BaseModel]: Generated Pydantic model class.
|
||||||
"""
|
"""
|
||||||
fields = {}
|
fields: dict[str, Any] = {}
|
||||||
|
|
||||||
if "properties" in dictionary:
|
if "properties" in dictionary:
|
||||||
for field_name, field_data in dictionary.get("properties", {}).items():
|
for field_name, field_data in dictionary.get("properties", {}).items():
|
||||||
|
@ -1277,7 +1275,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
|
||||||
if items != {}:
|
if items != {}:
|
||||||
array = {"properties": items}
|
array = {"properties": items}
|
||||||
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
|
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
|
||||||
fields[field_name] = (List[array_type], ...)
|
fields[field_name] = (List[array_type], ...) # type: ignore[valid-type]
|
||||||
else:
|
else:
|
||||||
fields[field_name] = (list, ...)
|
fields[field_name] = (list, ...)
|
||||||
elif field_type == "object":
|
elif field_type == "object":
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
set(TARGET server)
|
set(TARGET server)
|
||||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||||
add_executable(${TARGET} server.cpp json.hpp httplib.h)
|
add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_definitions(${TARGET} PRIVATE
|
target_compile_definitions(${TARGET} PRIVATE
|
||||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||||
|
|
208
examples/server/oai.hpp
Normal file
208
examples/server/oai.hpp
Normal file
|
@ -0,0 +1,208 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <set>
|
||||||
|
#include <mutex>
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include "json.hpp"
|
||||||
|
#include "utils.hpp"
|
||||||
|
|
||||||
|
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
||||||
|
|
||||||
|
using json = nlohmann::json;
|
||||||
|
|
||||||
|
// Translates an OpenAI-style chat completion request into the parameter
// object used for a llama.cpp completion.
//
// body:    parsed request JSON following the OpenAI chat API semantics
// returns: JSON object keyed by llama.cpp parameter names, tagged with
//          "__oaicompat" = true
// throws:  nlohmann::json exception when "messages" is absent from `body`
inline static json oaicompat_completion_params_parse(
    const json &body /* openai api json semantics */)
{
    json llama_params;

    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
    //
    // For parameters that are defined by the OpenAI documentation (e.g.
    // temperature), we explicitly specify OpenAI's intended default; we
    // need to do that because sometimes OpenAI disagrees with llama.cpp
    //
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
    // `body` is const, and const operator[] with a missing key is undefined
    // behaviour in nlohmann::json; at() reports the problem with an exception.
    llama_params["prompt"]            = format_chatml(body.at("messages")); // OpenAI 'messages' to llama.cpp 'prompt'
    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
    llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body, "stream", false);
    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);

    // the grammar is only forwarded when the client supplied one
    if (body.count("grammar") != 0) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
    }

    // Handle 'stop' field: a single string is wrapped into a one-element array
    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }

    // Ensure there is ChatML-specific end sequence among stop words
    llama_params["stop"].push_back("<|im_end|>");

    return llama_params;
}
|
||||||
|
|
||||||
|
// Builds the final OpenAI-compatible response object for a finished task.
//
// request:   original request JSON; only its "model" field is read
// response:  task result whose result_json carries the completion fields
// streaming: when true the object is shaped as the closing
//            "chat.completion.chunk" (empty delta), otherwise as a full
//            "chat.completion" carrying the assistant message
// returns:   the response JSON object
inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
{
    // bind by reference: the result is only read here
    const json & result = response.result_json;

    // Read the flag's value rather than testing for the key's presence, so
    // that an explicit `"stopped_word": false` does not count as a stop. This
    // matches how format_partial_response_oaicompat reads the same field.
    bool stopped_word        = json_value(result, "stopped_word", false);
    bool stopped_eos         = json_value(result, "stopped_eos", false);
    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
    std::string content      = json_value(result, "content", std::string(""));

    // anything that is not a stop word / EOS is reported as "length"
    std::string finish_reason = "length";
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }

    json choices =
        streaming ? json::array({json{{"finish_reason", finish_reason},
                                      {"index", 0},
                                      {"delta", json::object()}}})
                  : json::array({json{{"finish_reason", finish_reason},
                                      {"index", 0},
                                      {"message", json{{"content", content},
                                                       {"role", "assistant"}}}}});

    std::time_t t = std::time(0);

    json res =
        json{{"choices", choices},
             {"created", t},
             {"model",
              json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
             {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
             {"usage",
              json{{"completion_tokens", num_tokens_predicted},
                   {"prompt_tokens",     num_prompt_tokens},
                   {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
             {"id", gen_chatcmplid()}};

    // in verbose mode the raw llama.cpp result is attached for debugging
    if (server_verbose) {
        res["__verbose"] = result;
    }

    if (result.contains("completion_probabilities")) {
        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
    }

    return res;
}
|
||||||
|
|
||||||
|
// return value is vector as there is one case where we might need to generate two responses
|
||||||
|
inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
|
||||||
|
json result = response.result_json;
|
||||||
|
|
||||||
|
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
|
||||||
|
return std::vector<json>({response.result_json});
|
||||||
|
}
|
||||||
|
|
||||||
|
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
|
||||||
|
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
||||||
|
|
||||||
|
bool stopped_word = json_value(result, "stopped_word", false);
|
||||||
|
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||||
|
bool stopped_limit = json_value(result, "stopped_limit", false);
|
||||||
|
std::string content = json_value(result, "content", std::string(""));
|
||||||
|
|
||||||
|
std::string finish_reason;
|
||||||
|
if (stopped_word || stopped_eos) {
|
||||||
|
finish_reason = "stop";
|
||||||
|
}
|
||||||
|
if (stopped_limit) {
|
||||||
|
finish_reason = "length";
|
||||||
|
}
|
||||||
|
|
||||||
|
std::time_t t = std::time(0);
|
||||||
|
|
||||||
|
json choices;
|
||||||
|
|
||||||
|
if (!finish_reason.empty()) {
|
||||||
|
choices = json::array({json{{"finish_reason", finish_reason},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json::object()}}});
|
||||||
|
} else {
|
||||||
|
if (first) {
|
||||||
|
if (content.empty()) {
|
||||||
|
choices = json::array({json{{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json{{"role", "assistant"}}}}});
|
||||||
|
} else {
|
||||||
|
// We have to send this as two updates to conform to openai behavior
|
||||||
|
json initial_ret = json{{"choices", json::array({json{
|
||||||
|
{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json{
|
||||||
|
{"role", "assistant"}
|
||||||
|
}}}})},
|
||||||
|
{"created", t},
|
||||||
|
{"id", gen_chatcmplid()},
|
||||||
|
{"model", modelname},
|
||||||
|
{"object", "chat.completion.chunk"}};
|
||||||
|
|
||||||
|
json second_ret = json{
|
||||||
|
{"choices", json::array({json{{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json{
|
||||||
|
{"content", content}}}
|
||||||
|
}})},
|
||||||
|
{"created", t},
|
||||||
|
{"id", gen_chatcmplid()},
|
||||||
|
{"model", modelname},
|
||||||
|
{"object", "chat.completion.chunk"}};
|
||||||
|
|
||||||
|
return std::vector<json>({initial_ret, second_ret});
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Some idiosyncrasy in task processing logic makes several trailing calls
|
||||||
|
// with empty content, we ignore these at the calee site.
|
||||||
|
if (content.empty()) {
|
||||||
|
return std::vector<json>({json::object()});
|
||||||
|
}
|
||||||
|
|
||||||
|
choices = json::array({json{
|
||||||
|
{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta",
|
||||||
|
json{
|
||||||
|
{"content", content},
|
||||||
|
}},
|
||||||
|
}});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
json ret = json{{"choices", choices},
|
||||||
|
{"created", t},
|
||||||
|
{"id", gen_chatcmplid()},
|
||||||
|
{"model", modelname},
|
||||||
|
{"object", "chat.completion.chunk"}};
|
||||||
|
|
||||||
|
return std::vector<json>({ret});
|
||||||
|
}
|
File diff suppressed because it is too large
Load diff
507
examples/server/utils.hpp
Normal file
507
examples/server/utils.hpp
Normal file
|
@ -0,0 +1,507 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <set>
|
||||||
|
#include <mutex>
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include "json.hpp"
|
||||||
|
|
||||||
|
#include "../llava/clip.h"
|
||||||
|
|
||||||
|
using json = nlohmann::json;
|
||||||
|
|
||||||
|
extern bool server_verbose;
|
||||||
|
|
||||||
|
// Verbose logging is compiled in unless the build defines SERVER_VERBOSE to
// something other than 1.
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif

// LOG_VERBOSE additionally checks the runtime `server_verbose` flag; when
// verbose logging is compiled out it expands to nothing.
#if SERVER_VERBOSE == 1
#define LOG_VERBOSE(MSG, ...)                                            \
    do                                                                   \
    {                                                                    \
        if (server_verbose)                                              \
        {                                                                \
            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
        }                                                                \
    } while (0)
#else
#define LOG_VERBOSE(MSG, ...)
#endif

// Unconditional log helpers: they forward to server_log with the call site.
#define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO(   MSG, ...) server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)
|
||||||
|
|
||||||
|
//
|
||||||
|
// parallel
|
||||||
|
//
|
||||||
|
|
||||||
|
// Lifecycle state of the server with respect to model loading.
enum server_state
{
    // Server is starting up, model not fully loaded yet
    SERVER_STATE_LOADING_MODEL,
    // Server is ready and model is loaded
    SERVER_STATE_READY,
    // An error occurred, load_model failed
    SERVER_STATE_ERROR
};
|
||||||
|
|
||||||
|
// Kind of work item carried by a task_server.
enum task_type
{
    TASK_TYPE_COMPLETION,
    TASK_TYPE_CANCEL,
    TASK_TYPE_NEXT_RESPONSE
};
|
||||||
|
|
||||||
|
struct task_server {
|
||||||
|
int id = -1; // to be filled by llama_server_queue
|
||||||
|
int target_id;
|
||||||
|
task_type type;
|
||||||
|
json data;
|
||||||
|
bool infill_mode = false;
|
||||||
|
bool embedding_mode = false;
|
||||||
|
int multitask_id = -1;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct task_result {
|
||||||
|
int id;
|
||||||
|
int multitask_id = -1;
|
||||||
|
bool stop;
|
||||||
|
bool error;
|
||||||
|
json result_json;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct task_multi {
|
||||||
|
int id;
|
||||||
|
std::set<int> subtasks_remaining{};
|
||||||
|
std::vector<task_result> results{};
|
||||||
|
};
|
||||||
|
|
||||||
|
// State of a slot.
// TODO: can become bool if we can't find use of more states
enum slot_state {
    IDLE,
    PROCESSING,
};
|
||||||
|
|
||||||
|
// Command pending on a slot.
enum slot_command {
    NONE,
    LOAD_PROMPT,
    RELEASE,
};
|
||||||
|
|
||||||
|
struct slot_params
|
||||||
|
{
|
||||||
|
bool stream = true;
|
||||||
|
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
||||||
|
|
||||||
|
uint32_t seed = -1; // RNG seed
|
||||||
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
|
|
||||||
|
std::vector<std::string> antiprompt;
|
||||||
|
|
||||||
|
json input_prefix;
|
||||||
|
json input_suffix;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct slot_image
|
||||||
|
{
|
||||||
|
int32_t id;
|
||||||
|
|
||||||
|
bool request_encode_image = false;
|
||||||
|
float * image_embedding = nullptr;
|
||||||
|
int32_t image_tokens = 0;
|
||||||
|
|
||||||
|
clip_image_u8 * img_data;
|
||||||
|
|
||||||
|
std::string prefix_prompt; // before of this image
|
||||||
|
};
|
||||||
|
|
||||||
|
// completion token output with probabilities
|
||||||
|
struct completion_token_output
|
||||||
|
{
|
||||||
|
struct token_prob
|
||||||
|
{
|
||||||
|
llama_token tok;
|
||||||
|
float prob;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<token_prob> probs;
|
||||||
|
llama_token tok;
|
||||||
|
std::string text_to_send;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void server_log(const char *level, const char *function, int line,
|
||||||
|
const char *message, const nlohmann::ordered_json &extra)
|
||||||
|
{
|
||||||
|
nlohmann::ordered_json log
|
||||||
|
{
|
||||||
|
{"timestamp", time(nullptr)},
|
||||||
|
{"level", level},
|
||||||
|
{"function", function},
|
||||||
|
{"line", line},
|
||||||
|
{"message", message},
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!extra.empty())
|
||||||
|
{
|
||||||
|
log.merge_patch(extra);
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||||
|
printf("%.*s\n", (int)str.size(), str.data());
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// server utils
|
||||||
|
//
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static T json_value(const json &body, const std::string &key, const T &default_value)
|
||||||
|
{
|
||||||
|
// Fallback null to default value
|
||||||
|
return body.contains(key) && !body.at(key).is_null()
|
||||||
|
? body.value(key, default_value)
|
||||||
|
: default_value;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::string format_chatml(std::vector<json> messages)
|
||||||
|
{
|
||||||
|
std::ostringstream chatml_msgs;
|
||||||
|
|
||||||
|
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
||||||
|
chatml_msgs << "<|im_start|>"
|
||||||
|
<< json_value(*it, "role", std::string("user")) << '\n';
|
||||||
|
chatml_msgs << json_value(*it, "content", std::string(""))
|
||||||
|
<< "<|im_end|>\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
chatml_msgs << "<|im_start|>assistant" << '\n';
|
||||||
|
|
||||||
|
return chatml_msgs.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// work queue utils
|
||||||
|
//
|
||||||
|
|
||||||
|
struct llama_server_queue {
|
||||||
|
int id = 0;
|
||||||
|
std::mutex mutex_tasks;
|
||||||
|
// queues
|
||||||
|
std::vector<task_server> queue_tasks;
|
||||||
|
std::vector<task_server> queue_tasks_deferred;
|
||||||
|
std::vector<task_multi> queue_multitasks;
|
||||||
|
std::condition_variable condition_tasks;
|
||||||
|
// callback functions
|
||||||
|
std::function<void(task_server&)> callback_new_task;
|
||||||
|
std::function<void(task_multi&)> callback_finish_multitask;
|
||||||
|
std::function<void(void)> callback_all_task_finished;
|
||||||
|
|
||||||
|
// Add a new task to the end of the queue
|
||||||
|
int post(task_server task) {
|
||||||
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
|
if (task.id == -1) {
|
||||||
|
task.id = id++;
|
||||||
|
}
|
||||||
|
queue_tasks.push_back(std::move(task));
|
||||||
|
condition_tasks.notify_one();
|
||||||
|
return task.id;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add a new task, but defer until one slot is available
|
||||||
|
void defer(task_server task) {
|
||||||
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
|
queue_tasks_deferred.push_back(std::move(task));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the next id for creating anew task
|
||||||
|
int get_new_id() {
|
||||||
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
|
return id++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register function to process a new task
|
||||||
|
void on_new_task(std::function<void(task_server&)> callback) {
|
||||||
|
callback_new_task = callback;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register function to process a multitask
|
||||||
|
void on_finish_multitask(std::function<void(task_multi&)> callback) {
|
||||||
|
callback_finish_multitask = callback;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register the function to be called when the batch of tasks is finished
|
||||||
|
void on_all_tasks_finished(std::function<void(void)> callback) {
|
||||||
|
callback_all_task_finished = callback;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call when the state of one slot is changed
|
||||||
|
void notify_slot_changed() {
|
||||||
|
// move deferred tasks back to main loop
|
||||||
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
|
for (auto & task : queue_tasks_deferred) {
|
||||||
|
queue_tasks.push_back(std::move(task));
|
||||||
|
}
|
||||||
|
queue_tasks_deferred.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start the main loop. This call is blocking
|
||||||
|
void start_loop() {
|
||||||
|
while (true) {
|
||||||
|
// new task arrived
|
||||||
|
LOG_VERBOSE("have new task", {});
|
||||||
|
{
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
|
if (queue_tasks.empty()) {
|
||||||
|
lock.unlock();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
task_server task = queue_tasks.front();
|
||||||
|
queue_tasks.erase(queue_tasks.begin());
|
||||||
|
lock.unlock();
|
||||||
|
LOG_VERBOSE("callback_new_task", {});
|
||||||
|
callback_new_task(task);
|
||||||
|
}
|
||||||
|
LOG_VERBOSE("callback_all_task_finished", {});
|
||||||
|
// process and update all the multitasks
|
||||||
|
auto queue_iterator = queue_multitasks.begin();
|
||||||
|
while (queue_iterator != queue_multitasks.end())
|
||||||
|
{
|
||||||
|
if (queue_iterator->subtasks_remaining.empty())
|
||||||
|
{
|
||||||
|
// all subtasks done == multitask is done
|
||||||
|
task_multi current_multitask = *queue_iterator;
|
||||||
|
callback_finish_multitask(current_multitask);
|
||||||
|
// remove this multitask
|
||||||
|
queue_iterator = queue_multitasks.erase(queue_iterator);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
++queue_iterator;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// all tasks in the current loop is finished
|
||||||
|
callback_all_task_finished();
|
||||||
|
}
|
||||||
|
LOG_VERBOSE("wait for new task", {});
|
||||||
|
// wait for new task
|
||||||
|
{
|
||||||
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
|
if (queue_tasks.empty()) {
|
||||||
|
condition_tasks.wait(lock, [&]{
|
||||||
|
return !queue_tasks.empty();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// functions to manage multitasks
|
||||||
|
//
|
||||||
|
|
||||||
|
// add a multitask by specifying the id of all subtask (subtask is a task_server)
|
||||||
|
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||||
|
task_multi multi;
|
||||||
|
multi.id = multitask_id;
|
||||||
|
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
||||||
|
queue_multitasks.push_back(multi);
|
||||||
|
}
|
||||||
|
|
||||||
|
// updatethe remaining subtasks, while appending results to multitask
|
||||||
|
void update_multitask(int multitask_id, int subtask_id, task_result& result)
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||||
|
for (auto& multitask : queue_multitasks)
|
||||||
|
{
|
||||||
|
if (multitask.id == multitask_id)
|
||||||
|
{
|
||||||
|
multitask.subtasks_remaining.erase(subtask_id);
|
||||||
|
multitask.results.push_back(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_server_response {
|
||||||
|
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
|
||||||
|
callback_multitask_t callback_update_multitask;
|
||||||
|
// for keeping track of all tasks waiting for the result
|
||||||
|
std::set<int> waiting_task_ids;
|
||||||
|
// the main result queue
|
||||||
|
std::vector<task_result> queue_results;
|
||||||
|
std::mutex mutex_results;
|
||||||
|
std::condition_variable condition_results;
|
||||||
|
|
||||||
|
void add_waiting_task_id(int task_id) {
|
||||||
|
std::unique_lock<std::mutex> lock(mutex_results);
|
||||||
|
waiting_task_ids.insert(task_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
void remove_waiting_task_id(int task_id) {
|
||||||
|
std::unique_lock<std::mutex> lock(mutex_results);
|
||||||
|
waiting_task_ids.erase(task_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function blocks the thread until there is a response for this task_id
|
||||||
|
task_result recv(int task_id) {
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
std::unique_lock<std::mutex> lock(mutex_results);
|
||||||
|
condition_results.wait(lock, [&]{
|
||||||
|
return !queue_results.empty();
|
||||||
|
});
|
||||||
|
LOG_VERBOSE("condition_results unblock", {});
|
||||||
|
|
||||||
|
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||||
|
{
|
||||||
|
if (queue_results[i].id == task_id)
|
||||||
|
{
|
||||||
|
assert(queue_results[i].multitask_id == -1);
|
||||||
|
task_result res = queue_results[i];
|
||||||
|
queue_results.erase(queue_results.begin() + i);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// should never reach here
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register the function to update multitask
|
||||||
|
void on_multitask_update(callback_multitask_t callback) {
|
||||||
|
callback_update_multitask = callback;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send a new result to a waiting task_id
|
||||||
|
void send(task_result result) {
|
||||||
|
std::unique_lock<std::mutex> lock(mutex_results);
|
||||||
|
LOG_VERBOSE("send new result", {});
|
||||||
|
for (auto& task_id : waiting_task_ids) {
|
||||||
|
// LOG_TEE("waiting task id %i \n", task_id);
|
||||||
|
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||||
|
if (result.multitask_id == task_id)
|
||||||
|
{
|
||||||
|
LOG_VERBOSE("callback_update_multitask", {});
|
||||||
|
callback_update_multitask(task_id, result.id, result);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result.id == task_id)
|
||||||
|
{
|
||||||
|
LOG_VERBOSE("queue_results.push_back", {});
|
||||||
|
queue_results.push_back(result);
|
||||||
|
condition_results.notify_one();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//
|
||||||
|
// base64 utils (TODO: move to common in the future)
|
||||||
|
//
|
||||||
|
|
||||||
|
// Base64 alphabet; a character's position in this string is its 6-bit value.
static const std::string base64_chars(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
|
||||||
|
|
||||||
|
// True when `c` is a base64 alphabet character ('=' padding is not included).
static inline bool is_base64(uint8_t c)
{
    if (c == '+' || c == '/')
    {
        return true;
    }
    return isalnum(c);
}
|
||||||
|
|
||||||
|
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
int j = 0;
|
||||||
|
int in_ = 0;
|
||||||
|
|
||||||
|
int in_len = encoded_string.size();
|
||||||
|
|
||||||
|
uint8_t char_array_4[4];
|
||||||
|
uint8_t char_array_3[3];
|
||||||
|
|
||||||
|
std::vector<uint8_t> ret;
|
||||||
|
|
||||||
|
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
|
||||||
|
{
|
||||||
|
char_array_4[i++] = encoded_string[in_]; in_++;
|
||||||
|
if (i == 4)
|
||||||
|
{
|
||||||
|
for (i = 0; i <4; i++)
|
||||||
|
{
|
||||||
|
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
||||||
|
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
||||||
|
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
||||||
|
|
||||||
|
for (i = 0; (i < 3); i++)
|
||||||
|
{
|
||||||
|
ret.push_back(char_array_3[i]);
|
||||||
|
}
|
||||||
|
i = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i)
|
||||||
|
{
|
||||||
|
for (j = i; j <4; j++)
|
||||||
|
{
|
||||||
|
char_array_4[j] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (j = 0; j <4; j++)
|
||||||
|
{
|
||||||
|
char_array_4[j] = base64_chars.find(char_array_4[j]);
|
||||||
|
}
|
||||||
|
|
||||||
|
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
||||||
|
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
||||||
|
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
||||||
|
|
||||||
|
for (j = 0; (j < i - 1); j++)
|
||||||
|
{
|
||||||
|
ret.push_back(char_array_3[j]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// random string / id
|
||||||
|
//
|
||||||
|
|
||||||
|
// Returns 32 random characters drawn from [0-9A-Za-z].
// A new generator is seeded from std::random_device on every call.
static std::string random_string()
{
    static const std::string alphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

    std::random_device seed_source;
    std::mt19937 engine(seed_source());

    std::string text(32, ' ');

    for (char & ch : text) {
        ch = alphabet[engine() % alphabet.size()];
    }

    return text;
}
|
||||||
|
|
||||||
|
static std::string gen_chatcmplid()
|
||||||
|
{
|
||||||
|
std::stringstream chatcmplid;
|
||||||
|
chatcmplid << "chatcmpl-" << random_string();
|
||||||
|
return chatcmplid.str();
|
||||||
|
}
|
|
@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
||||||
if (block->size >= size) {
|
if (block->size >= size) {
|
||||||
best_fit_block = alloc->n_free_blocks - 1;
|
best_fit_block = alloc->n_free_blocks - 1;
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
|
||||||
__func__, size, max_avail);
|
__func__, tensor->name, size, max_avail);
|
||||||
GGML_ASSERT(!"not enough space in the buffer");
|
GGML_ASSERT(!"not enough space in the buffer");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
|
size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
|
||||||
return alloc->max_size;
|
// FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
|
||||||
|
// to avoid this, we add a 10% margin to the buffer size
|
||||||
|
return alloc->max_size + alloc->max_size/10;
|
||||||
}
|
}
|
||||||
|
|
||||||
// graph allocator
|
// graph allocator
|
||||||
|
|
|
@ -30,7 +30,9 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||||
if (buft->iface.get_alloc_size) {
|
if (buft->iface.get_alloc_size) {
|
||||||
return buft->iface.get_alloc_size(buft, tensor);
|
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
||||||
|
assert(size >= ggml_nbytes(tensor));
|
||||||
|
return size;
|
||||||
}
|
}
|
||||||
return ggml_nbytes(tensor);
|
return ggml_nbytes(tensor);
|
||||||
}
|
}
|
||||||
|
|
25
ggml-cuda.cu
25
ggml-cuda.cu
|
@ -4283,7 +4283,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
||||||
q8 += 8;
|
q8 += 8;
|
||||||
aux32 >>= 7;
|
aux32 >>= 7;
|
||||||
}
|
}
|
||||||
const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f;
|
const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||||
return d * sumi;
|
return d * sumi;
|
||||||
#else
|
#else
|
||||||
// iqs is 0...15
|
// iqs is 0...15
|
||||||
|
@ -4294,7 +4294,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
||||||
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
||||||
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
||||||
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
||||||
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f;
|
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||||
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
||||||
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
|
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
|
||||||
const int8_t * q8 = bq8_1[ib32].qs + 16*il;
|
const int8_t * q8 = bq8_1[ib32].qs + 16*il;
|
||||||
|
@ -4339,7 +4339,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||||
}
|
}
|
||||||
q8 += 8;
|
q8 += 8;
|
||||||
}
|
}
|
||||||
const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f;
|
const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||||
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
||||||
#else
|
#else
|
||||||
assert(false);
|
assert(false);
|
||||||
|
@ -9790,8 +9790,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
||||||
// TODO: mmq/mmv support
|
// TODO: mmq/mmv support
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const int64_t nb11 = src1->nb[1];
|
const size_t nb11 = src1->nb[1];
|
||||||
const int64_t nb1 = dst->nb[1];
|
const size_t nb1 = dst->nb[1];
|
||||||
|
|
||||||
const struct ggml_tensor * ids = src0;
|
const struct ggml_tensor * ids = src0;
|
||||||
const int32_t id = ((int32_t *) dst->op_params)[0];
|
const int32_t id = ((int32_t *) dst->op_params)[0];
|
||||||
|
@ -10304,15 +10304,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
|
||||||
|
|
||||||
if (ggml_is_quantized(tensor->type)) {
|
if (ggml_is_quantized(tensor->type)) {
|
||||||
// initialize padding to 0 to avoid possible NaN values
|
// initialize padding to 0 to avoid possible NaN values
|
||||||
int64_t row_low = 0;
|
size_t original_size = ggml_nbytes(tensor);
|
||||||
int64_t row_high = ggml_nrows(tensor);
|
|
||||||
int64_t nrows_split = row_high - row_low;
|
|
||||||
|
|
||||||
size_t original_size = ggml_nbytes_split(tensor, nrows_split);
|
|
||||||
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||||
|
|
||||||
if (padded_size > original_size && tensor->view_src == nullptr) {
|
if (padded_size > original_size && tensor->view_src == nullptr) {
|
||||||
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
|
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10415,12 +10411,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||||
int64_t row_low = 0;
|
size_t size = ggml_nbytes(tensor);
|
||||||
int64_t row_high = ggml_nrows(tensor);
|
|
||||||
int64_t nrows_split = row_high - row_low;
|
|
||||||
|
|
||||||
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
|
||||||
|
|
||||||
int64_t ne0 = tensor->ne[0];
|
int64_t ne0 = tensor->ne[0];
|
||||||
|
|
||||||
if (ggml_is_quantized(tensor->type)) {
|
if (ggml_is_quantized(tensor->type)) {
|
||||||
|
|
66
ggml-metal.m
66
ggml-metal.m
|
@ -26,15 +26,6 @@
|
||||||
|
|
||||||
#define GGML_METAL_MAX_KERNELS 256
|
#define GGML_METAL_MAX_KERNELS 256
|
||||||
|
|
||||||
struct ggml_metal_buffer {
|
|
||||||
const char * name;
|
|
||||||
|
|
||||||
void * data;
|
|
||||||
size_t size;
|
|
||||||
|
|
||||||
id<MTLBuffer> metal;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_metal_kernel {
|
struct ggml_metal_kernel {
|
||||||
id<MTLFunction> function;
|
id<MTLFunction> function;
|
||||||
id<MTLComputePipelineState> pipeline;
|
id<MTLComputePipelineState> pipeline;
|
||||||
|
@ -172,9 +163,6 @@ struct ggml_metal_context {
|
||||||
|
|
||||||
dispatch_queue_t d_queue;
|
dispatch_queue_t d_queue;
|
||||||
|
|
||||||
int n_buffers;
|
|
||||||
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
|
||||||
|
|
||||||
struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
|
struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
|
||||||
|
|
||||||
bool support_simdgroup_reduction;
|
bool support_simdgroup_reduction;
|
||||||
|
@ -242,24 +230,20 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
// Show all the Metal device instances in the system
|
// Show all the Metal device instances in the system
|
||||||
NSArray * devices = MTLCopyAllDevices();
|
NSArray * devices = MTLCopyAllDevices();
|
||||||
for (id<MTLDevice> device in devices) {
|
for (id<MTLDevice> device in devices) {
|
||||||
NSString * s = [device name];
|
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
|
||||||
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
|
|
||||||
}
|
}
|
||||||
[devices release]; // since it was created by a *Copy* C method
|
[devices release]; // since it was created by a *Copy* C method
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Pick and show default Metal device
|
// Pick and show default Metal device
|
||||||
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
||||||
NSString * s = [device name];
|
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
|
||||||
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
|
||||||
|
|
||||||
// Configure context
|
// Configure context
|
||||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||||
ctx->device = device;
|
ctx->device = device;
|
||||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||||
ctx->queue = [ctx->device newCommandQueue];
|
ctx->queue = [ctx->device newCommandQueue];
|
||||||
ctx->n_buffers = 0;
|
|
||||||
|
|
||||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||||
|
|
||||||
// load library
|
// load library
|
||||||
|
@ -277,6 +261,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
||||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
||||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||||
|
if (error) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||||
|
|
||||||
|
@ -315,14 +303,13 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
//[options setFastMathEnabled:false];
|
//[options setFastMathEnabled:false];
|
||||||
|
|
||||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// print MTL GPU family:
|
// print MTL GPU family:
|
||||||
GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
|
GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
|
||||||
|
@ -531,10 +518,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
static void ggml_metal_free(struct ggml_metal_context * ctx) {
|
static void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||||
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
|
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
|
||||||
|
|
||||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
|
||||||
[ctx->buffers[i].metal release];
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
|
for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
|
||||||
if (ctx->kernels[i].pipeline) {
|
if (ctx->kernels[i].pipeline) {
|
||||||
[ctx->kernels[i].pipeline release];
|
[ctx->kernels[i].pipeline release];
|
||||||
|
@ -577,15 +560,13 @@ struct ggml_backend_metal_buffer_context {
|
||||||
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
||||||
// Metal buffer based on the host memory pointer
|
// Metal buffer based on the host memory pointer
|
||||||
//
|
//
|
||||||
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
|
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
|
||||||
//GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
//GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
||||||
|
|
||||||
const int64_t tsize = ggml_nbytes(t);
|
const int64_t tsize = ggml_nbytes(t);
|
||||||
|
|
||||||
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
|
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
|
||||||
|
|
||||||
// compatibility with ggml-backend
|
|
||||||
if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
|
|
||||||
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
|
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
|
||||||
|
|
||||||
// find the view that contains the tensor fully
|
// find the view that contains the tensor fully
|
||||||
|
@ -604,25 +585,6 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
||||||
|
|
||||||
GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
|
GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
|
||||||
|
|
||||||
return nil;
|
|
||||||
}
|
|
||||||
|
|
||||||
// find the view that contains the tensor fully
|
|
||||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
|
||||||
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
|
|
||||||
|
|
||||||
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
|
|
||||||
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
|
||||||
*offs = (size_t) ioffs;
|
|
||||||
|
|
||||||
//GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
|
||||||
|
|
||||||
return ctx->buffers[i].metal;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
|
|
||||||
|
|
||||||
return nil;
|
return nil;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -814,9 +776,9 @@ static bool ggml_metal_graph_compute(
|
||||||
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
||||||
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
|
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
|
||||||
|
|
||||||
id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
|
id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
|
||||||
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
|
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
|
||||||
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;
|
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;
|
||||||
|
|
||||||
//GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
|
//GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
|
||||||
//if (src0) {
|
//if (src0) {
|
||||||
|
@ -1598,7 +1560,7 @@ static bool ggml_metal_graph_compute(
|
||||||
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
|
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
|
||||||
|
|
||||||
size_t offs_src_cur = 0;
|
size_t offs_src_cur = 0;
|
||||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
|
||||||
|
|
||||||
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
|
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
|
||||||
}
|
}
|
||||||
|
@ -1743,7 +1705,7 @@ static bool ggml_metal_graph_compute(
|
||||||
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
|
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
|
||||||
|
|
||||||
size_t offs_src_cur = 0;
|
size_t offs_src_cur = 0;
|
||||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
|
||||||
|
|
||||||
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
|
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
|
||||||
}
|
}
|
||||||
|
|
2
ggml.c
2
ggml.c
|
@ -16597,7 +16597,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_SOFT_MAX:
|
case GGML_OP_SOFT_MAX:
|
||||||
{
|
{
|
||||||
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
|
n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||||
{
|
{
|
||||||
|
|
|
@ -107,7 +107,7 @@ class GGUFReader:
|
||||||
offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
|
offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
|
||||||
new_align = self.fields.get('general.alignment')
|
new_align = self.fields.get('general.alignment')
|
||||||
if new_align is not None:
|
if new_align is not None:
|
||||||
if new_align.types != [GGUFValueType.UINT64]:
|
if new_align.types != [GGUFValueType.UINT32]:
|
||||||
raise ValueError('Bad type for general.alignment field')
|
raise ValueError('Bad type for general.alignment field')
|
||||||
self.alignment = new_align.parts[-1][0]
|
self.alignment = new_align.parts[-1][0]
|
||||||
padding = offs % self.alignment
|
padding = offs % self.alignment
|
||||||
|
|
454
llama.cpp
454
llama.cpp
|
@ -1669,6 +1669,9 @@ struct llama_context {
|
||||||
for (ggml_backend_t backend : backends) {
|
for (ggml_backend_t backend : backends) {
|
||||||
ggml_backend_free(backend);
|
ggml_backend_free(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_backend_buffer_free(buf_input);
|
||||||
|
ggml_free(ctx_input);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_cparams cparams;
|
llama_cparams cparams;
|
||||||
|
@ -1715,8 +1718,14 @@ struct llama_context {
|
||||||
// allocator for the input tensors
|
// allocator for the input tensors
|
||||||
ggml_tallocr * alloc = nullptr;
|
ggml_tallocr * alloc = nullptr;
|
||||||
|
|
||||||
// temporary buffer for copying data to/from the backend
|
// input tensors
|
||||||
std::vector<no_init<uint8_t>> buf_copy;
|
ggml_backend_buffer_t buf_input = nullptr;
|
||||||
|
ggml_context * ctx_input = nullptr;
|
||||||
|
struct ggml_tensor * inp_tokens; // I32 [n_batch]
|
||||||
|
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
||||||
|
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
||||||
|
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
||||||
|
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
||||||
|
|
||||||
#ifdef GGML_USE_MPI
|
#ifdef GGML_USE_MPI
|
||||||
ggml_mpi_context * ctx_mpi = NULL;
|
ggml_mpi_context * ctx_mpi = NULL;
|
||||||
|
@ -4089,22 +4098,24 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||||
const llama_hparams & hparams,
|
const llama_hparams & hparams,
|
||||||
const llama_batch & batch,
|
const llama_batch & batch,
|
||||||
struct ggml_tensor * tok_embd,
|
struct ggml_tensor * tok_embd,
|
||||||
|
struct ggml_tensor * inp_tokens,
|
||||||
|
struct ggml_tensor * inp_embd,
|
||||||
const llm_build_cb & cb) {
|
const llm_build_cb & cb) {
|
||||||
const int64_t n_embd = hparams.n_embd;
|
const int64_t n_embd = hparams.n_embd;
|
||||||
|
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
if (batch.token) {
|
if (batch.token) {
|
||||||
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
|
struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
|
||||||
cb(inp_tokens, "inp_tokens", -1);
|
cb(inp_tokens, "inp_tokens", -1);
|
||||||
|
|
||||||
inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
|
inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
|
||||||
} else {
|
} else {
|
||||||
#ifdef GGML_USE_MPI
|
#ifdef GGML_USE_MPI
|
||||||
GGML_ASSERT(false && "not implemented");
|
GGML_ASSERT(false && "not implemented");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
return inpL;
|
return inpL;
|
||||||
|
@ -4118,6 +4129,7 @@ static void llm_build_k_shift(
|
||||||
const llama_cparams & cparams,
|
const llama_cparams & cparams,
|
||||||
const llama_kv_cache & kv,
|
const llama_kv_cache & kv,
|
||||||
struct ggml_cgraph * graph,
|
struct ggml_cgraph * graph,
|
||||||
|
struct ggml_tensor * K_shift,
|
||||||
llm_rope_type type,
|
llm_rope_type type,
|
||||||
int64_t n_ctx,
|
int64_t n_ctx,
|
||||||
float freq_base,
|
float freq_base,
|
||||||
|
@ -4134,9 +4146,6 @@ static void llm_build_k_shift(
|
||||||
const float beta_fast = cparams.yarn_beta_fast;
|
const float beta_fast = cparams.yarn_beta_fast;
|
||||||
const float beta_slow = cparams.yarn_beta_slow;
|
const float beta_slow = cparams.yarn_beta_slow;
|
||||||
|
|
||||||
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
|
|
||||||
cb(K_shift, "K_shift", -1);
|
|
||||||
|
|
||||||
int rope_type = 0;
|
int rope_type = 0;
|
||||||
|
|
||||||
switch (type) {
|
switch (type) {
|
||||||
|
@ -4457,6 +4466,7 @@ static struct ggml_tensor * llm_build_kv(
|
||||||
|
|
||||||
struct llm_build_context {
|
struct llm_build_context {
|
||||||
const llama_model & model;
|
const llama_model & model;
|
||||||
|
const llama_context & lctx;
|
||||||
const llama_hparams & hparams;
|
const llama_hparams & hparams;
|
||||||
const llama_cparams & cparams;
|
const llama_cparams & cparams;
|
||||||
const llama_batch & batch;
|
const llama_batch & batch;
|
||||||
|
@ -4503,6 +4513,7 @@ struct llm_build_context {
|
||||||
const llm_build_cb & cb,
|
const llm_build_cb & cb,
|
||||||
bool worst_case) :
|
bool worst_case) :
|
||||||
model (lctx.model),
|
model (lctx.model),
|
||||||
|
lctx (lctx),
|
||||||
hparams (model.hparams),
|
hparams (model.hparams),
|
||||||
cparams (lctx.cparams),
|
cparams (lctx.cparams),
|
||||||
batch (batch),
|
batch (batch),
|
||||||
|
@ -4563,20 +4574,20 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
// shift the entire K-cache if needed
|
||||||
if (do_rope_shift) {
|
if (do_rope_shift) {
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -4747,20 +4758,20 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
// shift the entire K-cache if needed
|
||||||
if (do_rope_shift) {
|
if (do_rope_shift) {
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -4868,20 +4879,20 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
// shift the entire K-cache if needed
|
||||||
if (do_rope_shift) {
|
if (do_rope_shift) {
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -4990,15 +5001,15 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * pos;
|
struct ggml_tensor * pos;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||||
|
@ -5087,19 +5098,19 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
if (do_rope_shift) {
|
if (do_rope_shift) {
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -5294,11 +5305,11 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -5384,11 +5395,11 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
inpL = llm_build_norm(ctx0, inpL, hparams,
|
inpL = llm_build_norm(ctx0, inpL, hparams,
|
||||||
|
@ -5477,11 +5488,11 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -5573,20 +5584,20 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
// shift the entire K-cache if needed
|
||||||
if (do_rope_shift) {
|
if (do_rope_shift) {
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -5696,20 +5707,20 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
// shift the entire K-cache if needed
|
||||||
if (do_rope_shift) {
|
if (do_rope_shift) {
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -5810,20 +5821,20 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
// shift the entire K-cache if needed
|
||||||
if (do_rope_shift) {
|
if (do_rope_shift) {
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -5931,20 +5942,20 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * ffn_output;
|
struct ggml_tensor * ffn_output;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
// shift the entire K-cache if needed
|
||||||
if (do_rope_shift) {
|
if (do_rope_shift) {
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -6053,20 +6064,20 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
// shift the entire K-cache if needed
|
||||||
if (do_rope_shift) {
|
if (do_rope_shift) {
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -6160,15 +6171,15 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * pos;
|
struct ggml_tensor * pos;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||||
|
@ -6258,20 +6269,20 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
cb(inp_pos, "inp_pos", -1);
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
// shift the entire K-cache if needed
|
||||||
if (do_rope_shift) {
|
if (do_rope_shift) {
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
@ -6365,15 +6376,7 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
// check if we should build the worst-case graph (for memory measurement)
|
// check if we should build the worst-case graph (for memory measurement)
|
||||||
const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
|
const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
|
||||||
|
|
||||||
// keep track of the input that has already been allocated
|
|
||||||
bool alloc_inp_tokens = false;
|
|
||||||
bool alloc_inp_embd = false;
|
|
||||||
bool alloc_inp_pos = false;
|
|
||||||
bool alloc_inp_KQ_mask = false;
|
|
||||||
bool alloc_inp_K_shift = false;
|
|
||||||
|
|
||||||
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
||||||
// TODO: improve handling of input and output tensors, then replace this with ggml_set_name
|
|
||||||
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
||||||
if (il >= 0) {
|
if (il >= 0) {
|
||||||
ggml_format_name(cur, "%s-%d", name, il);
|
ggml_format_name(cur, "%s-%d", name, il);
|
||||||
|
@ -6381,71 +6384,49 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
ggml_set_name(cur, name);
|
ggml_set_name(cur, name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (!lctx.cparams.offload_kqv) {
|
if (!lctx.cparams.offload_kqv) {
|
||||||
if (strcmp(name, "kqv_merged_cont") == 0) {
|
if (strcmp(name, "kqv_merged_cont") == 0) {
|
||||||
// all nodes between the KV store and the attention output are run on the CPU
|
// all nodes between the KV store and the attention output are run on the CPU
|
||||||
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
|
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_cgraph * result = NULL;
|
||||||
|
|
||||||
|
struct llm_build_context llm(lctx, batch, cb, worst_case);
|
||||||
|
|
||||||
//
|
//
|
||||||
// allocate input tensors and set input data
|
// set input data
|
||||||
//
|
//
|
||||||
|
|
||||||
if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
|
|
||||||
ggml_tallocr_alloc(lctx.alloc, cur);
|
|
||||||
|
|
||||||
if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
|
|
||||||
const int64_t n_tokens = cur->ne[0];
|
|
||||||
|
|
||||||
ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
|
|
||||||
}
|
|
||||||
|
|
||||||
alloc_inp_tokens = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
|
|
||||||
ggml_tallocr_alloc(lctx.alloc, cur);
|
|
||||||
|
|
||||||
if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
|
|
||||||
const int64_t n_embd = cur->ne[0];
|
|
||||||
const int64_t n_tokens = cur->ne[1];
|
|
||||||
|
|
||||||
ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
|
|
||||||
}
|
|
||||||
|
|
||||||
alloc_inp_embd = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
|
|
||||||
ggml_tallocr_alloc(lctx.alloc, cur);
|
|
||||||
|
|
||||||
if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
|
|
||||||
const int64_t n_tokens = cur->ne[0];
|
|
||||||
|
|
||||||
static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
|
|
||||||
ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
|
|
||||||
}
|
|
||||||
|
|
||||||
alloc_inp_pos = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
|
|
||||||
ggml_tallocr_alloc(lctx.alloc, cur);
|
|
||||||
|
|
||||||
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
||||||
const int64_t n_kv = cur->ne[0];
|
if (batch.token) {
|
||||||
const int64_t n_tokens = cur->ne[1];
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
|
||||||
float * data;
|
ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
|
||||||
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
|
||||||
data = (float *) cur->data;
|
|
||||||
} else {
|
|
||||||
lctx.buf_copy.resize(ggml_nbytes(cur));
|
|
||||||
data = (float *) lctx.buf_copy.data();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (batch.embd) {
|
||||||
|
const int64_t n_embd = llm.n_embd;
|
||||||
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
|
||||||
|
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (batch.pos) {
|
||||||
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
|
||||||
|
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const int64_t n_kv = llm.n_kv;
|
||||||
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
|
||||||
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
||||||
|
float * data = (float *) lctx.inp_KQ_mask->data;
|
||||||
|
|
||||||
for (int h = 0; h < 1; ++h) {
|
for (int h = 0; h < 1; ++h) {
|
||||||
for (int j = 0; j < n_tokens; ++j) {
|
for (int j = 0; j < n_tokens; ++j) {
|
||||||
const llama_pos pos = batch.pos[j];
|
const llama_pos pos = batch.pos[j];
|
||||||
|
@ -6462,46 +6443,20 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (data != cur->data) {
|
|
||||||
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
alloc_inp_KQ_mask = true;
|
if (llm.do_rope_shift) {
|
||||||
}
|
const int64_t n_ctx = llm.n_ctx;
|
||||||
|
|
||||||
if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
||||||
ggml_tallocr_alloc(lctx.alloc, cur);
|
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
||||||
|
|
||||||
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
|
||||||
const int64_t n_ctx = cur->ne[0];
|
|
||||||
|
|
||||||
int32_t * data;
|
|
||||||
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
|
||||||
data = (int32_t *) cur->data;
|
|
||||||
} else {
|
|
||||||
lctx.buf_copy.resize(ggml_nbytes(cur));
|
|
||||||
data = (int32_t *) lctx.buf_copy.data();
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < n_ctx; ++i) {
|
for (int i = 0; i < n_ctx; ++i) {
|
||||||
data[i] = lctx.kv_self.cells[i].delta;
|
data[i] = lctx.kv_self.cells[i].delta;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (data != cur->data) {
|
|
||||||
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
alloc_inp_K_shift = true;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_cgraph * result = NULL;
|
|
||||||
|
|
||||||
struct llm_build_context llm(lctx, batch, cb, worst_case);
|
|
||||||
|
|
||||||
llm.init();
|
llm.init();
|
||||||
|
|
||||||
switch (model.arch) {
|
switch (model.arch) {
|
||||||
|
@ -8001,10 +7956,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
|
||||||
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
||||||
return a.logit > b.logit;
|
return a.logit > b.logit;
|
||||||
};
|
};
|
||||||
if (k == (int) candidates->size) {
|
if (k <= 128) {
|
||||||
std::sort(candidates->data, candidates->data + candidates->size, comp);
|
|
||||||
} else {
|
|
||||||
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
|
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
|
||||||
|
} else {
|
||||||
|
constexpr int nbuckets = 128;
|
||||||
|
constexpr float bucket_low = -10.0f;
|
||||||
|
constexpr float bucket_high = 10.0f;
|
||||||
|
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
|
||||||
|
constexpr float bucker_inter = -bucket_low * bucket_scale;
|
||||||
|
|
||||||
|
std::vector<int> bucket_idx(candidates->size);
|
||||||
|
std::vector<int> histo(nbuckets, 0);
|
||||||
|
|
||||||
|
for (int i = 0; i < (int)candidates->size; ++i) {
|
||||||
|
const float val = candidates->data[i].logit;
|
||||||
|
int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
||||||
|
ib = std::max(0, std::min(nbuckets-1, ib));
|
||||||
|
bucket_idx[i] = ib;
|
||||||
|
++histo[ib];
|
||||||
|
}
|
||||||
|
int nhave = 0;
|
||||||
|
int ib = nbuckets - 1;
|
||||||
|
for ( ; ib >= 0; --ib) {
|
||||||
|
nhave += histo[ib];
|
||||||
|
if (nhave >= k) break;
|
||||||
|
}
|
||||||
|
std::vector<llama_token_data> tmp_tokens(nhave);
|
||||||
|
auto ptr = tmp_tokens.data();
|
||||||
|
std::vector<llama_token_data*> bucket_ptrs;
|
||||||
|
bucket_ptrs.reserve(nbuckets - ib);
|
||||||
|
for (int j = nbuckets - 1; j >= ib; --j) {
|
||||||
|
bucket_ptrs.push_back(ptr);
|
||||||
|
ptr += histo[j];
|
||||||
|
}
|
||||||
|
for (int i = 0; i < (int)candidates->size; ++i) {
|
||||||
|
int j = bucket_idx[i];
|
||||||
|
if (j >= ib) {
|
||||||
|
*bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ptr = tmp_tokens.data();
|
||||||
|
int ndone = 0;
|
||||||
|
for (int j = nbuckets-1; j > ib; --j) {
|
||||||
|
std::sort(ptr, ptr + histo[j], comp);
|
||||||
|
ptr += histo[j];
|
||||||
|
ndone += histo[j];
|
||||||
|
}
|
||||||
|
std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
|
||||||
|
|
||||||
|
std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
|
||||||
|
|
||||||
}
|
}
|
||||||
candidates->sorted = true;
|
candidates->sorted = true;
|
||||||
}
|
}
|
||||||
|
@ -8196,6 +8198,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
|
||||||
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
|
// no need to do anything if there is only one (or zero) candidates
|
||||||
|
if(candidates_p->size <= 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate maximum possible entropy
|
||||||
|
float max_entropy = -logf(1.0f / candidates_p->size);
|
||||||
|
|
||||||
|
llama_sample_softmax(nullptr, candidates_p);
|
||||||
|
|
||||||
|
// Calculate entropy of the softmax probabilities
|
||||||
|
float entropy = 0.0f;
|
||||||
|
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||||
|
float prob = candidates_p->data[i].p;
|
||||||
|
if (prob > 0.0f) { // Ensure no log(0)
|
||||||
|
entropy -= prob * logf(prob);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
|
||||||
|
float normalized_entropy = entropy / max_entropy;
|
||||||
|
|
||||||
|
// Map the normalized entropy to the desired temperature range using the power function
|
||||||
|
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
|
||||||
|
LLAMA_LOG_INFO("Entropy: %f\n", entropy);
|
||||||
|
LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
|
||||||
|
LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
|
||||||
|
LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
|
||||||
|
LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Apply the dynamically calculated temperature scaling
|
||||||
|
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||||
|
candidates_p->data[i].logit /= dyn_temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-compute softmax probabilities after scaling logits with dynamic temperature
|
||||||
|
double max_l_double = candidates_p->data[0].logit;
|
||||||
|
double cum_sum_double = 0.0;
|
||||||
|
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||||
|
double p = exp(candidates_p->data[i].logit - max_l_double);
|
||||||
|
candidates_p->data[i].p = p; // Store the scaled probability
|
||||||
|
cum_sum_double += p;
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||||
|
candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
// Print the updated top 25 probabilities after temperature scaling
|
||||||
|
LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
|
||||||
|
for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
|
||||||
|
LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (ctx) {
|
||||||
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
||||||
const int64_t t_start_sample_us = ggml_time_us();
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
|
@ -8874,6 +8943,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||||
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
|
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
|
||||||
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
|
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
|
||||||
};
|
};
|
||||||
|
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
||||||
|
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
|
||||||
|
if (n_expert > 1) {
|
||||||
|
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
|
||||||
|
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
||||||
|
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
||||||
|
// tensor name.
|
||||||
|
n_layer /= n_expert;
|
||||||
|
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
|
||||||
|
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
|
||||||
|
}
|
||||||
|
if (i_layer < 0 || i_layer >= n_layer) {
|
||||||
|
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return std::make_pair(i_layer, n_layer);
|
||||||
|
};
|
||||||
|
|
||||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
||||||
int nx = tensor->ne[0];
|
int nx = tensor->ne[0];
|
||||||
|
@ -8935,24 +9021,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||||
new_type = GGML_TYPE_Q2_K;
|
new_type = GGML_TYPE_Q2_K;
|
||||||
}
|
}
|
||||||
} else if (name.find("ffn_down") != std::string::npos) {
|
} else if (name.find("ffn_down") != std::string::npos) {
|
||||||
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
||||||
int i_layer, n_layer;
|
int i_layer = info.first, n_layer = info.second;
|
||||||
if (n_expert == 1) {
|
|
||||||
i_layer = qs.i_ffn_down;
|
|
||||||
n_layer = qs.n_ffn_down;
|
|
||||||
} else {
|
|
||||||
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
|
|
||||||
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
|
||||||
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
|
||||||
// tensor name.
|
|
||||||
n_layer = qs.n_ffn_down / n_expert;
|
|
||||||
if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
|
|
||||||
throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
|
|
||||||
}
|
|
||||||
if (i_layer < 0 || i_layer >= n_layer) {
|
|
||||||
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
||||||
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
||||||
|
@ -9008,13 +9078,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
||||||
}
|
}
|
||||||
else if (name.find("ffn_gate") != std::string::npos) {
|
else if (name.find("ffn_gate") != std::string::npos) {
|
||||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
|
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
|
||||||
|
int i_layer = info.first, n_layer = info.second;
|
||||||
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
|
||||||
new_type = GGML_TYPE_Q2_K;
|
new_type = GGML_TYPE_Q2_K;
|
||||||
}
|
}
|
||||||
++qs.i_ffn_gate;
|
++qs.i_ffn_gate;
|
||||||
}
|
}
|
||||||
else if (name.find("ffn_up") != std::string::npos) {
|
else if (name.find("ffn_up") != std::string::npos) {
|
||||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
|
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
|
||||||
|
int i_layer = info.first, n_layer = info.second;
|
||||||
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
|
||||||
new_type = GGML_TYPE_Q2_K;
|
new_type = GGML_TYPE_Q2_K;
|
||||||
}
|
}
|
||||||
++qs.i_ffn_up;
|
++qs.i_ffn_up;
|
||||||
|
@ -9964,6 +10038,35 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ctx->embedding.resize(hparams.n_embd);
|
ctx->embedding.resize(hparams.n_embd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// graph inputs
|
||||||
|
{
|
||||||
|
ggml_init_params init_params = {
|
||||||
|
/* .mem_size */ ggml_tensor_overhead()*5,
|
||||||
|
/* .mem_buffer */ nullptr,
|
||||||
|
/* .no_alloc */ true,
|
||||||
|
};
|
||||||
|
ctx->ctx_input = ggml_init(init_params);
|
||||||
|
|
||||||
|
ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
||||||
|
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
|
||||||
|
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
||||||
|
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
||||||
|
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
||||||
|
|
||||||
|
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
||||||
|
ggml_set_name(ctx->inp_embd, "inp_embd");
|
||||||
|
ggml_set_name(ctx->inp_pos, "inp_pos");
|
||||||
|
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
||||||
|
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
||||||
|
|
||||||
|
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
||||||
|
|
||||||
|
LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
|
||||||
|
ggml_backend_buffer_name(ctx->buf_input),
|
||||||
|
ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// scheduler and compute buffers
|
||||||
{
|
{
|
||||||
// buffer types used for the compute buffer of each backend
|
// buffer types used for the compute buffer of each backend
|
||||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||||
|
@ -9990,9 +10093,6 @@ struct llama_context * llama_new_context_with_model(
|
||||||
|
|
||||||
// initialize scheduler with the worst-case graph
|
// initialize scheduler with the worst-case graph
|
||||||
ggml_backend_sched_init_measure(ctx->sched, gf);
|
ggml_backend_sched_init_measure(ctx->sched, gf);
|
||||||
// note: the number of splits during measure is higher than during inference due to the kv shift
|
|
||||||
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
|
||||||
LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
|
|
||||||
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
||||||
|
|
||||||
for (ggml_backend_t backend : ctx->backends) {
|
for (ggml_backend_t backend : ctx->backends) {
|
||||||
|
@ -10001,6 +10101,10 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ggml_backend_buffer_name(buf),
|
ggml_backend_buffer_name(buf),
|
||||||
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// note: the number of splits during measure is higher than during inference due to the kv shift
|
||||||
|
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
||||||
|
LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
8
llama.h
8
llama.h
|
@ -775,6 +775,14 @@ extern "C" {
|
||||||
float p,
|
float p,
|
||||||
size_t min_keep);
|
size_t min_keep);
|
||||||
|
|
||||||
|
/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
|
||||||
|
LLAMA_API void llama_sample_entropy(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
llama_token_data_array * candidates_p,
|
||||||
|
float min_temp,
|
||||||
|
float max_temp,
|
||||||
|
float exponent_val);
|
||||||
|
|
||||||
LLAMA_API void llama_sample_temp(
|
LLAMA_API void llama_sample_temp(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
llama_token_data_array * candidates,
|
llama_token_data_array * candidates,
|
||||||
|
|
50
scripts/ci-run.sh
Executable file
50
scripts/ci-run.sh
Executable file
|
@ -0,0 +1,50 @@
|
||||||
|
#!/bin/bash
|
||||||
|
set -euo pipefail
|
||||||
|
this=$(realpath "$0"); readonly this
|
||||||
|
cd "$(dirname "$this")"
|
||||||
|
shellcheck "$this"
|
||||||
|
|
||||||
|
if (( $# != 1 && $# != 2 )); then
|
||||||
|
cat >&2 <<'EOF'
|
||||||
|
usage:
|
||||||
|
ci-run.sh <tmp_dir> [<cache_dir>]
|
||||||
|
|
||||||
|
This script wraps ci/run.sh:
|
||||||
|
* If <tmp_dir> is a ramdisk, you can reduce writes to your SSD. If <tmp_dir> is not a ramdisk, keep in mind that total writes will increase by the size of <cache_dir>.
|
||||||
|
(openllama_3b_v2: quantized models are about 30GB)
|
||||||
|
* Persistent model and data files are synced to and from <cache_dir>,
|
||||||
|
excluding generated .gguf files.
|
||||||
|
(openllama_3b_v2: persistent files are about 6.6GB)
|
||||||
|
* <cache_dir> defaults to ~/.cache/llama.cpp
|
||||||
|
EOF
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
cd .. # => llama.cpp repo root
|
||||||
|
|
||||||
|
tmp="$1"
|
||||||
|
mkdir -p "$tmp"
|
||||||
|
tmp=$(realpath "$tmp")
|
||||||
|
echo >&2 "Using tmp=$tmp"
|
||||||
|
|
||||||
|
cache="${2-$HOME/.cache/llama.cpp}"
|
||||||
|
mkdir -p "$cache"
|
||||||
|
cache=$(realpath "$cache")
|
||||||
|
echo >&2 "Using cache=$cache"
|
||||||
|
|
||||||
|
_sync() {
|
||||||
|
local from="$1"; shift
|
||||||
|
local to="$1"; shift
|
||||||
|
|
||||||
|
echo >&2 "Syncing from $from to $to"
|
||||||
|
mkdir -p "$from" "$to"
|
||||||
|
rsync -a "$from" "$to" --delete-during "$@"
|
||||||
|
}
|
||||||
|
|
||||||
|
_sync "$(realpath .)/" "$tmp/llama.cpp"
|
||||||
|
_sync "$cache/ci-mnt/models/" "$tmp/llama.cpp/ci-mnt/models/"
|
||||||
|
|
||||||
|
cd "$tmp/llama.cpp"
|
||||||
|
bash ci/run.sh ci-out ci-mnt
|
||||||
|
|
||||||
|
_sync 'ci-mnt/models/' "$cache/ci-mnt/models/" --exclude='*.gguf' -P
|
|
@ -46,7 +46,7 @@ Formatting considerations:
|
||||||
- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
|
- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
|
||||||
- To define a tensor split, pass a list of floats.
|
- To define a tensor split, pass a list of floats.
|
||||||
"""
|
"""
|
||||||
usage = "run_with_preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
|
usage = "run-with-preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
|
||||||
epilog = (" --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
|
epilog = (" --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
|
||||||
"Unknown args will be ignored.")
|
"Unknown args will be ignored.")
|
||||||
|
|
3
tests/.gitignore
vendored
Normal file
3
tests/.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
*
|
||||||
|
!*.*
|
||||||
|
test-c.o
|
|
@ -1,6 +1,6 @@
|
||||||
function(llama_build_executable source)
|
function(llama_build_executable source)
|
||||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||||
add_executable(${TEST_TARGET} ${source})
|
add_executable(${TEST_TARGET} ${source} get-model.cpp)
|
||||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||||
target_link_libraries(${TEST_TARGET} PRIVATE common)
|
target_link_libraries(${TEST_TARGET} PRIVATE common)
|
||||||
endfunction()
|
endfunction()
|
||||||
|
@ -8,14 +8,20 @@ endfunction()
|
||||||
function(llama_test_executable name source)
|
function(llama_test_executable name source)
|
||||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||||
add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||||
|
set_property(TEST ${name} PROPERTY LABELS "main")
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
function(llama_build_and_test_executable source)
|
function(llama_build_and_test_executable source)
|
||||||
|
llama_build_and_test_executable_with_label(${source} "main")
|
||||||
|
endfunction()
|
||||||
|
|
||||||
|
function(llama_build_and_test_executable_with_label source label)
|
||||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||||
add_executable(${TEST_TARGET} ${source})
|
add_executable(${TEST_TARGET} ${source} get-model.cpp)
|
||||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||||
target_link_libraries(${TEST_TARGET} PRIVATE common)
|
target_link_libraries(${TEST_TARGET} PRIVATE common)
|
||||||
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||||
|
set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label})
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
# llama_build_and_test_executable(test-double-float.cpp) # SLOW
|
# llama_build_and_test_executable(test-double-float.cpp) # SLOW
|
||||||
|
@ -49,10 +55,12 @@ llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||||
llama_build_and_test_executable(test-grad0.cpp)
|
llama_build_and_test_executable(test-grad0.cpp)
|
||||||
# llama_build_and_test_executable(test-opt.cpp) # SLOW
|
# llama_build_and_test_executable(test-opt.cpp) # SLOW
|
||||||
llama_build_and_test_executable(test-backend-ops.cpp)
|
llama_build_and_test_executable(test-backend-ops.cpp)
|
||||||
llama_build_and_test_executable(test-autorelease.cpp)
|
|
||||||
|
|
||||||
llama_build_and_test_executable(test-rope.cpp)
|
llama_build_and_test_executable(test-rope.cpp)
|
||||||
|
|
||||||
|
llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model")
|
||||||
|
llama_build_and_test_executable_with_label(test-autorelease.cpp "model")
|
||||||
|
|
||||||
# dummy executable - not installed
|
# dummy executable - not installed
|
||||||
get_filename_component(TEST_TARGET test-c.c NAME_WE)
|
get_filename_component(TEST_TARGET test-c.c NAME_WE)
|
||||||
add_executable(${TEST_TARGET} test-c.c)
|
add_executable(${TEST_TARGET} test-c.c)
|
||||||
|
|
21
tests/get-model.cpp
Normal file
21
tests/get-model.cpp
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#include "get-model.h"
|
||||||
|
|
||||||
|
char * get_model_or_exit(int argc, char *argv[]) {
|
||||||
|
char * model_path;
|
||||||
|
if (argc > 1) {
|
||||||
|
model_path = argv[1];
|
||||||
|
|
||||||
|
} else {
|
||||||
|
model_path = getenv("LLAMACPP_TEST_MODELFILE");
|
||||||
|
if (!model_path || strlen(model_path) == 0) {
|
||||||
|
fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. Set LLAMACPP_TEST_MODELFILE=<gguf_model_path> to silence this warning and run this test.\n\033[0m");
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return model_path;
|
||||||
|
}
|
2
tests/get-model.h
Normal file
2
tests/get-model.h
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
#pragma once
|
||||||
|
char * get_model_or_exit(int, char*[]);
|
|
@ -5,19 +5,15 @@
|
||||||
#include <thread>
|
#include <thread>
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
#include "get-model.h"
|
||||||
|
|
||||||
// This creates a new context inside a pthread and then tries to exit cleanly.
|
// This creates a new context inside a pthread and then tries to exit cleanly.
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
if (argc < 2) {
|
auto * model_path = get_model_or_exit(argc, argv);
|
||||||
printf("Usage: %s model.gguf\n", argv[0]);
|
|
||||||
return 0; // intentionally return success
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string fname = argv[1];
|
std::thread([&model_path]() {
|
||||||
|
|
||||||
std::thread([&fname]() {
|
|
||||||
llama_backend_init(false);
|
llama_backend_init(false);
|
||||||
auto * model = llama_load_model_from_file(fname.c_str(), llama_model_default_params());
|
auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
|
||||||
auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
|
auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
27
tests/test-model-load-cancel.cpp
Normal file
27
tests/test-model-load-cancel.cpp
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
#include "llama.h"
|
||||||
|
#include "get-model.h"
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
|
||||||
|
int main(int argc, char *argv[] ) {
|
||||||
|
auto * model_path = get_model_or_exit(argc, argv);
|
||||||
|
auto * file = fopen(model_path, "r");
|
||||||
|
if (file == nullptr) {
|
||||||
|
fprintf(stderr, "no model at '%s' found\n", model_path);
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "using '%s'\n", model_path);
|
||||||
|
fclose(file);
|
||||||
|
|
||||||
|
llama_backend_init(false);
|
||||||
|
auto params = llama_model_params{};
|
||||||
|
params.use_mmap = false;
|
||||||
|
params.progress_callback = [](float progress, void * ctx){
|
||||||
|
(void) ctx;
|
||||||
|
return progress > 0.50;
|
||||||
|
};
|
||||||
|
auto * model = llama_load_model_from_file(model_path, params);
|
||||||
|
llama_backend_free();
|
||||||
|
return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue