Merge branch 'master' into stablelm-support

This commit is contained in:
Galunid 2023-11-07 11:22:13 +01:00
commit 698c94595e
24 changed files with 1437 additions and 777 deletions

View file

@ -288,6 +288,7 @@ jobs:
OPENBLAS_VERSION: 0.3.23 OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17 OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0 CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.21.1-2023-04-24
strategy: strategy:
matrix: matrix:
@ -383,11 +384,23 @@ jobs:
- name: Test - name: Test
id: cmake_test id: cmake_test
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
run: | run: |
cd build cd build
ctest -C Release --verbose --timeout 900 ctest -C Release --verbose --timeout 900
- name: Test (Intel SDE)
id: cmake_test_sde
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
cd build
& $sde -future -- ctest -C Release --verbose --timeout 900
- name: Determine tag name - name: Determine tag name
id: tag id: tag
shell: bash shell: bash

2
.gitignore vendored
View file

@ -46,7 +46,7 @@ models-mnt
/infill /infill
/libllama.so /libllama.so
/llama-bench /llama-bench
/llava /llava-cli
/main /main
/metal /metal
/perplexity /perplexity

View file

@ -10,7 +10,7 @@ endif()
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(LLAMA_STANDALONE ON) set(LLAMA_STANDALONE ON)
# configure project version # configure project version
@ -44,7 +44,7 @@ endif()
# general # general
option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF) option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
option(LLAMA_LTO "llama: enable link time optimization" OFF) option(LLAMA_LTO "llama: enable link time optimization" OFF)
# debug # debug
@ -510,6 +510,10 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" ) elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
message(STATUS "x86 detected") message(STATUS "x86 detected")
if (MSVC) if (MSVC)
# instruction set detection for MSVC only
if (LLAMA_NATIVE)
include(cmake/FindSIMD.cmake)
endif ()
if (LLAMA_AVX512) if (LLAMA_AVX512)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>) add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>) add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)

View file

@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = \ BUILD_TARGETS = \
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search \ simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
# Binaries only useful for tests # Binaries only useful for tests
@ -617,7 +617,10 @@ convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggm
llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ $(LDFLAGS) -Wno-cast-qual
llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)

100
cmake/FindSIMD.cmake Normal file
View file

@ -0,0 +1,100 @@
include(CheckCSourceRuns)
set(AVX_CODE "
#include <immintrin.h>
int main()
{
__m256 a;
a = _mm256_set1_ps(0);
return 0;
}
")
set(AVX512_CODE "
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
__m512i b = a;
__mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
return 0;
}
")
set(AVX2_CODE "
#include <immintrin.h>
int main()
{
__m256i a = {0};
a = _mm256_abs_epi16(a);
__m256i x;
_mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
return 0;
}
")
set(FMA_CODE "
#include <immintrin.h>
int main()
{
__m256 acc = _mm256_setzero_ps();
const __m256 d = _mm256_setzero_ps();
const __m256 p = _mm256_setzero_ps();
acc = _mm256_fmadd_ps( d, p, acc );
return 0;
}
")
macro(check_sse type flags)
set(__FLAG_I 1)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
foreach (__FLAG ${flags})
if (NOT ${type}_FOUND)
set(CMAKE_REQUIRED_FLAGS ${__FLAG})
check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
if (HAS_${type}_${__FLAG_I})
set(${type}_FOUND TRUE CACHE BOOL "${type} support")
set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
endif()
math(EXPR __FLAG_I "${__FLAG_I}+1")
endif()
endforeach()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
if (NOT ${type}_FOUND)
set(${type}_FOUND FALSE CACHE BOOL "${type} support")
set(${type}_FLAGS "" CACHE STRING "${type} flags")
endif()
mark_as_advanced(${type}_FOUND ${type}_FLAGS)
endmacro()
# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
set(LLAMA_AVX OFF)
else()
set(LLAMA_AVX ON)
endif()
check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
set(LLAMA_AVX2 OFF)
else()
set(LLAMA_AVX2 ON)
endif()
check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
set(LLAMA_AVX512 OFF)
else()
set(LLAMA_AVX512 ON)
endif()

View file

@ -41,6 +41,7 @@ endif()
set(TARGET common) set(TARGET common)
add_library(${TARGET} STATIC add_library(${TARGET} STATIC
base64.hpp
common.h common.h
common.cpp common.cpp
sampling.h sampling.h

392
common/base64.hpp Normal file
View file

@ -0,0 +1,392 @@
/*
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org>
*/
#ifndef PUBLIC_DOMAIN_BASE64_HPP_
#define PUBLIC_DOMAIN_BASE64_HPP_
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>
class base64_error : public std::runtime_error
{
public:
using std::runtime_error::runtime_error;
};
class base64
{
public:
enum class alphabet
{
/** the alphabet is detected automatically */
auto_,
/** the standard base64 alphabet is used */
standard,
/** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
url_filename_safe
};
enum class decoding_behavior
{
/** if the input is not padded, the remaining bits are ignored */
moderate,
/** if a padding character is encounter decoding is finished */
loose
};
/**
Encodes all the elements from `in_begin` to `in_end` to `out`.
@warning The source and destination cannot overlap. The destination must be able to hold at least
`required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
@tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
8 bits
@tparam Output_iterator the destination; the elements written to it are from the type `char`
@param in_begin the beginning of the source
@param in_end the ending of the source
@param out the destination iterator
@param alphabet which alphabet should be used
@returns the iterator to the next element past the last element copied
@throws see `Input_iterator` and `Output_iterator`
*/
template<typename Input_iterator, typename Output_iterator>
static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
alphabet alphabet = alphabet::standard)
{
constexpr auto pad = '=';
const char* alpha = alphabet == alphabet::url_filename_safe
? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
while (in_begin != in_end) {
std::uint8_t i0 = 0, i1 = 0, i2 = 0;
// first character
i0 = static_cast<std::uint8_t>(*in_begin);
++in_begin;
*out = alpha[i0 >> 2 & 0x3f];
++out;
// part of first character and second
if (in_begin != in_end) {
i1 = static_cast<std::uint8_t>(*in_begin);
++in_begin;
*out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
++out;
} else {
*out = alpha[(i0 & 0x3) << 4];
++out;
// last padding
*out = pad;
++out;
// last padding
*out = pad;
++out;
break;
}
// part of second character and third
if (in_begin != in_end) {
i2 = static_cast<std::uint8_t>(*in_begin);
++in_begin;
*out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
++out;
} else {
*out = alpha[(i1 & 0xf) << 2];
++out;
// last padding
*out = pad;
++out;
break;
}
// rest of third
*out = alpha[i2 & 0x3f];
++out;
}
return out;
}
/**
Encodes a string.
@param str the string that should be encoded
@param alphabet which alphabet should be used
@returns the encoded base64 string
@throws see base64::encode()
*/
static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
{
std::string result;
result.reserve(required_encode_size(str.length()) + 1);
encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
return result;
}
/**
Encodes a char array.
@param buffer the char array
@param size the size of the array
@param alphabet which alphabet should be used
@returns the encoded string
*/
static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
{
std::string result;
result.reserve(required_encode_size(size) + 1);
encode(buffer, buffer + size, std::back_inserter(result), alphabet);
return result;
}
/**
Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
in other words: inplace decoding is possible.
@warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
otherwise the behavior depends on the output iterator.
@tparam Input_iterator the source; the returned elements are cast to `char`
@tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
@param in_begin the beginning of the source
@param in_end the ending of the source
@param out the destination iterator
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the iterator to the next element past the last element copied
@throws base64_error depending on the set behavior
@throws see `Input_iterator` and `Output_iterator`
*/
template<typename Input_iterator, typename Output_iterator>
static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
//constexpr auto pad = '=';
std::uint8_t last = 0;
auto bits = 0;
while (in_begin != in_end) {
auto c = *in_begin;
++in_begin;
if (c == '=') {
break;
}
auto part = _base64_value(alphabet, c);
// enough bits for one byte
if (bits + 6 >= 8) {
*out = (last << (8 - bits)) | (part >> (bits - 2));
++out;
bits -= 2;
} else {
bits += 6;
}
last = part;
}
// check padding
if (behavior != decoding_behavior::loose) {
while (in_begin != in_end) {
auto c = *in_begin;
++in_begin;
if (c != '=') {
throw base64_error("invalid base64 character.");
}
}
}
return out;
}
/**
Decodes a string.
@param str the base64 encoded string
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the decoded string
@throws see base64::decode()
*/
static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
std::string result;
result.reserve(max_decode_size(str.length()));
decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
return result;
}
/**
Decodes a string.
@param buffer the base64 encoded buffer
@param size the size of the buffer
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the decoded string
@throws see base64::decode()
*/
static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
std::string result;
result.reserve(max_decode_size(size));
decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
return result;
}
/**
Decodes a string inplace.
@param[in,out] str the base64 encoded string
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@throws base64::decode_inplace()
*/
static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
}
/**
Decodes a char array inplace.
@param[in,out] str the string array
@param size the length of the array
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the pointer to the next element past the last element decoded
@throws base64::decode_inplace()
*/
static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
return decode(str, str + size, str, alphabet, behavior);
}
/**
Returns the required decoding size for a given size. The value is calculated with the following formula:
$$
\lceil \frac{size}{4} \rceil \cdot 3
$$
@param size the size of the encoded input
@returns the size of the resulting decoded buffer; this the absolute maximum
*/
static std::size_t max_decode_size(std::size_t size) noexcept
{
return (size / 4 + (size % 4 ? 1 : 0)) * 3;
}
/**
Returns the required encoding size for a given size. The value is calculated with the following formula:
$$
\lceil \frac{size}{3} \rceil \cdot 4
$$
@param size the size of the decoded input
@returns the size of the resulting encoded buffer
*/
static std::size_t required_encode_size(std::size_t size) noexcept
{
return (size / 3 + (size % 3 ? 1 : 0)) * 4;
}
private:
static std::uint8_t _base64_value(alphabet& alphabet, char c)
{
if (c >= 'A' && c <= 'Z') {
return c - 'A';
} else if (c >= 'a' && c <= 'z') {
return c - 'a' + 26;
} else if (c >= '0' && c <= '9') {
return c - '0' + 52;
}
// comes down to alphabet
if (alphabet == alphabet::standard) {
if (c == '+') {
return 62;
} else if (c == '/') {
return 63;
}
} else if (alphabet == alphabet::url_filename_safe) {
if (c == '-') {
return 62;
} else if (c == '_') {
return 63;
}
} // auto detect
else {
if (c == '+') {
alphabet = alphabet::standard;
return 62;
} else if (c == '/') {
alphabet = alphabet::standard;
return 63;
} else if (c == '-') {
alphabet = alphabet::url_filename_safe;
return 62;
} else if (c == '_') {
alphabet = alphabet::url_filename_safe;
return 63;
}
}
throw base64_error("invalid base64 character.");
}
};
#endif // !PUBLIC_DOMAIN_BASE64_HPP_

View file

@ -90,6 +90,19 @@ void process_escapes(std::string& input) {
case '\'': input[output_idx++] = '\''; break; case '\'': input[output_idx++] = '\''; break;
case '\"': input[output_idx++] = '\"'; break; case '\"': input[output_idx++] = '\"'; break;
case '\\': input[output_idx++] = '\\'; break; case '\\': input[output_idx++] = '\\'; break;
case 'x':
// Handle \x12, etc
if (input_idx + 2 < input_len) {
const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
char *err_p = nullptr;
const long val = std::strtol(x, &err_p, 16);
if (err_p == x + 2) {
input_idx += 2;
input[output_idx++] = char(val);
break;
}
}
// fall through
default: input[output_idx++] = '\\'; default: input[output_idx++] = '\\';
input[output_idx++] = input[input_idx]; break; input[output_idx++] = input[input_idx]; break;
} }

View file

@ -643,7 +643,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
return ggml_rope_custom(ctx, return ggml_rope_custom(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, 0, t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
); );
}; };

View file

@ -1,14 +1,36 @@
set(TARGET clip) add_library(llava OBJECT
add_library(${TARGET} clip.cpp clip.h) llava.cpp
install(TARGETS ${TARGET} LIBRARY) llava.h
target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT}) clip.cpp
target_compile_features(${TARGET} PRIVATE cxx_std_11) clip.h
if (NOT MSVC) )
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(llava PUBLIC .)
target_include_directories(llava PUBLIC ../..)
target_include_directories(llava PUBLIC ../../common)
target_compile_features(llava PRIVATE cxx_std_11)
add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
if (BUILD_SHARED_LIBS)
set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(llava_shared SHARED $<TARGET_OBJECTS:llava>)
target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS llava_shared LIBRARY)
endif() endif()
set(TARGET llava) if (NOT MSVC)
add_executable(${TARGET} llava.cpp) target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
install(TARGETS ${TARGET} RUNTIME) endif()
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT}) if(TARGET BUILD_INFO)
target_compile_features(${TARGET} PRIVATE cxx_std_11) add_dependencies(llava BUILD_INFO)
endif()
set(TARGET llava-cli)
add_executable(llava-cli llava-cli.cpp)
install(TARGETS llava-cli RUNTIME)
target_link_libraries(llava-cli PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(llava PRIVATE cxx_std_11)

View file

@ -9,12 +9,12 @@ models are available.
After API is confirmed, more models will be supported / uploaded. After API is confirmed, more models will be supported / uploaded.
## Usage ## Usage
Build with cmake or run `make llava` to build it. Build with cmake or run `make llava-cli` to build it.
After building, run: `./llava` to see the usage. For example: After building, run: `./llava-cli` to see the usage. For example:
```sh ```sh
./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg ./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
``` ```
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@ -51,7 +51,6 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
## TODO ## TODO
- [ ] Support server mode.
- [ ] Support non-CPU backend for the image encoding part. - [ ] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods. - [ ] Support different sampling methods.
- [ ] Support more model variants. - [ ] Support more model variants.

View file

@ -680,26 +680,44 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return new_clip; return new_clip;
} }
clip_image_u8 * make_clip_image_u8() { return new clip_image_u8(); } clip_image_u8 * make_clip_image_u8() {
auto img = new clip_image_u8();
return img;
}
clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); } clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
int nx, ny, nc; void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
auto data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname);
return false;
}
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
img->nx = nx; img->nx = nx;
img->ny = ny; img->ny = ny;
img->size = nx * ny * 3; img->size = nx * ny * 3;
img->data = new uint8_t[img->size](); img->data = new uint8_t[img->size]();
memcpy(img->data, data, img->size); memcpy(img->data, data, img->size);
}
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc;
auto data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
stbi_image_free(data); stbi_image_free(data);
return true;
}
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
int nx, ny, nc;
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) {
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
stbi_image_free(data);
return true; return true;
} }
@ -714,39 +732,40 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
clip_image_u8 temp; // we will keep the input image data here temporarily clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
if (pad2square && img->nx != img->ny) { if (pad2square && img->nx != img->ny) {
int longer_side = std::max(img->nx, img->ny); int longer_side = std::max(img->nx, img->ny);
temp.nx = longer_side; temp->nx = longer_side;
temp.ny = longer_side; temp->ny = longer_side;
temp.size = 3 * longer_side * longer_side; temp->size = 3 * longer_side * longer_side;
temp.data = new uint8_t[temp.size](); temp->data = new uint8_t[temp->size]();
uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
// fill with background color // fill with background color
for (size_t i = 0; i < temp.size; i++) { for (size_t i = 0; i < temp->size; i++) {
temp.data[i] = bc[i % 3]; temp->data[i] = bc[i % 3];
} }
// copy from the input image // copy from the input image
for (int y = 0; y < img->ny; y++) { for (int y = 0; y < img->ny; y++) {
for (int x = 0; x < img->nx; x++) { for (int x = 0; x < img->nx; x++) {
const int i = 3 * (y * img->nx + x); const int i = 3 * (y * img->nx + x);
const int j = 3 * (y * temp.nx + x); const int j = 3 * (y * temp->nx + x);
temp.data[j] = img->data[i]; temp->data[j] = img->data[i];
temp.data[j+1] = img->data[i+1]; temp->data[j+1] = img->data[i+1];
temp.data[j+2] = img->data[i+2]; temp->data[j+2] = img->data[i+2];
} }
} }
} else { } else {
temp.nx = img->nx; temp->nx = img->nx;
temp.ny = img->ny; temp->ny = img->ny;
temp.size = img->size; temp->size = img->size;
temp.data = img->data; temp->data = new uint8_t[temp->size]();
*temp->data = *img->data; // copy
} }
const int nx = temp.nx; const int nx = temp->nx;
const int ny = temp.ny; const int ny = temp->ny;
const int nx2 = ctx->vision_model.hparams.image_size; const int nx2 = ctx->vision_model.hparams.image_size;
const int ny2 = ctx->vision_model.hparams.image_size; const int ny2 = ctx->vision_model.hparams.image_size;
@ -785,10 +804,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
const int j10 = 3 * (y1 * nx + x0) + c; const int j10 = 3 * (y1 * nx + x0) + c;
const int j11 = 3 * (y1 * nx + x1) + c; const int j11 = 3 * (y1 * nx + x1) + c;
const float v00 = temp.data[j00]; const float v00 = temp->data[j00];
const float v01 = temp.data[j01]; const float v01 = temp->data[j01];
const float v10 = temp.data[j10]; const float v10 = temp->data[j10];
const float v11 = temp.data[j11]; const float v11 = temp->data[j11];
const float v0 = v00 * (1.0f - dx) + v01 * dx; const float v0 = v00 * (1.0f - dx) + v01 * dx;
const float v1 = v10 * (1.0f - dx) + v11 * dx; const float v1 = v10 * (1.0f - dx) + v11 * dx;
@ -803,6 +822,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
} }
} }
} }
clip_image_u8_free(temp);
return true; return true;
} }
@ -1049,16 +1069,16 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
return true; return true;
} }
int clip_n_mmproj_embd(struct clip_ctx * ctx) { int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->vision_model.mm_2_b->ne[0]; return ctx->vision_model.mm_2_b->ne[0];
} }
int clip_n_patches(struct clip_ctx * ctx) { int clip_n_patches(const struct clip_ctx * ctx) {
auto & params = ctx->vision_model.hparams; auto & params = ctx->vision_model.hparams;
return (params.image_size / params.patch_size) * (params.image_size / params.patch_size); return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
} }
size_t clip_embd_nbytes(struct clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
} }

View file

@ -1,7 +1,22 @@
#ifndef CLIP_H #ifndef CLIP_H
#define CLIP_H #define CLIP_H
#include "ggml.h" #include <stddef.h>
#include <stdint.h>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define CLIP_API __declspec(dllexport)
# else
# define CLIP_API __declspec(dllimport)
# endif
# else
# define CLIP_API __attribute__ ((visibility ("default")))
# endif
#else
# define CLIP_API
#endif
struct clip_ctx; struct clip_ctx;
@ -20,19 +35,20 @@ struct clip_vision_hparams {
float eps; float eps;
}; };
struct clip_ctx * clip_model_load(const char * fname, const int verbosity); /** load mmproj model */
CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
/** free mmproj model */
CLIP_API void clip_free(struct clip_ctx * ctx);
void clip_free(struct clip_ctx * ctx); size_t clip_embd_nbytes(const struct clip_ctx * ctx);
int clip_n_patches(const struct clip_ctx * ctx);
size_t clip_embd_nbytes(struct clip_ctx * ctx); int clip_n_mmproj_embd(const struct clip_ctx * ctx);
int clip_n_patches(struct clip_ctx * ctx);
int clip_n_mmproj_embd(struct clip_ctx * ctx);
// RGB uint8 image // RGB uint8 image
struct clip_image_u8 { struct clip_image_u8 {
int nx; int nx;
int ny; int ny;
uint8_t * data; uint8_t * data = NULL;
size_t size; size_t size;
}; };
@ -41,7 +57,7 @@ struct clip_image_u8 {
struct clip_image_f32 { struct clip_image_f32 {
int nx; int nx;
int ny; int ny;
float * data; float * data = NULL;
size_t size; size_t size;
}; };
@ -57,7 +73,12 @@ struct clip_image_f32_batch {
struct clip_image_u8 * make_clip_image_u8(); struct clip_image_u8 * make_clip_image_u8();
struct clip_image_f32 * make_clip_image_f32(); struct clip_image_f32 * make_clip_image_f32();
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); CLIP_API void clip_image_u8_free(clip_image_u8 * img);
CLIP_API void clip_image_f32_free(clip_image_f32 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square); bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec); bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);

View file

@ -0,0 +1,313 @@
#include "ggml.h"
#include "common.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "base64.hpp"
#include <cstdio>
#include <cstdlib>
#include <vector>
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}
// TODO: use common/sampling.h
static llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
auto & sparams = params.sparams;
// out of user input, sample next token
const float temp = sparams.temp;
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
const float top_p = sparams.top_p;
const float tfs_z = sparams.tfs_z;
const float typical_p = sparams.typical_p;
// const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
// const float repeat_penalty = sparams.repeat_penalty;
// const float alpha_presence = sparams.presence_penalty;
// const float alpha_frequency = sparams.frequency_penalty;
const int mirostat = sparams.mirostat;
const float mirostat_tau = sparams.mirostat_tau;
const float mirostat_eta = sparams.mirostat_eta;
// const bool penalize_nl = sparams.penalize_nl;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
// Apply params.logit_bias map
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}
}
}
return id;
}
static const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
int id = sample_id(ctx_llama, params);
static std::string ret;
if (id == llama_token_eos(llama_get_model(ctx_llama))) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}
static bool prompt_contains_image(const std::string& prompt) {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
return (begin != std::string::npos);
}
// replaces the base64 image tag in the prompt with `replacement`
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
auto required_bytes = base64::required_encode_size(base64_str.size());
auto img_bytes = std::vector<unsigned char>(required_bytes);
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
return NULL;
}
return embed;
}
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
if (begin == std::string::npos || end == std::string::npos) {
return prompt;
}
auto pre = prompt.substr(0, begin);
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
return pre + replacement + post;
}
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
// load and preprocess the image
llava_image_embed * embed = NULL;
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
printf("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
if (!embed) {
fprintf(stderr, "%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
return NULL;
}
}
return embed;
}
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
// llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, true);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
// generate the response
printf("\n");
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
if (strcmp(tmp, "</s>") == 0) break;
printf("%s", tmp);
fflush(stdout);
}
printf("\n");
}
static struct llava_context * llava_init(gpt_params * params) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_backend_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return NULL;
}
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return NULL;
}
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
return 1;
}
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
gpt_print_usage(argc, argv, params);
show_additional_info(argc, argv);
return 1;
}
auto ctx_llava = llava_init(&params);
if (ctx_llava == NULL) {
fprintf(stderr, "%s: error: failed to init llava\n", __func__);
return 1;
}
auto image_embed = load_image(ctx_llava, &params);
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
llava_free(ctx_llava);
return 0;
}

View file

@ -1,147 +0,0 @@
#pragma once
// this one and clip lib will be eventually merged to a single lib, let's keep it this way for now
#include "common.h"
#include "llama.h"
#include <cstdio>
#include <cstdlib>
#include <vector>
inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
for (int i = 0; i < N; i += n_batch) {
int n_eval = N - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
}
return true;
}
inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
}
return true;
}
inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}
// TODO: use common/sampling.h
inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
auto & sparams = params.sparams;
// out of user input, sample next token
const float temp = sparams.temp;
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
const float top_p = sparams.top_p;
const float tfs_z = sparams.tfs_z;
const float typical_p = sparams.typical_p;
// const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
// const float repeat_penalty = sparams.repeat_penalty;
// const float alpha_presence = sparams.presence_penalty;
// const float alpha_frequency = sparams.frequency_penalty;
const int mirostat = sparams.mirostat;
const float mirostat_tau = sparams.mirostat_tau;
const float mirostat_eta = sparams.mirostat_eta;
// const bool penalize_nl = sparams.penalize_nl;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
// Apply params.logit_bias map
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, repeat_penalty);
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl(ctx)] = nl_logit;
// }
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}
}
}
return id;
}
inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
int id = sample_id(ctx_llama, params);
static std::string ret;
if (id == llama_token_eos(llama_get_model(ctx_llama))) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}

View file

@ -1,164 +1,156 @@
#include "clip.h" #include "clip.h"
#include "llava-utils.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "llava.h"
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <vector> #include <vector>
static void show_additional_info(int /*argc*/, char ** argv) { #include "base64.hpp"
printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
int main(int argc, char ** argv) { static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
ggml_time_init(); clip_image_f32 * img_res = make_clip_image_f32();
if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
gpt_params params; fprintf(stderr, "%s: unable to preprocess image\n", __func__);
clip_image_f32_free(img_res);
if (!gpt_params_parse(argc, argv, params)) { return false;
show_additional_info(argc, argv);
return 1;
} }
if (params.mmproj.empty() || params.image.empty()) { *n_img_pos = clip_n_patches(ctx_clip);
gpt_print_usage(argc, argv, params);
show_additional_info(argc, argv);
return 1;
}
const char * clip_path = params.mmproj.c_str();
const char * img_path = params.image.c_str();
if (params.prompt.empty()) {
params.prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
// load and preprocess the image
clip_image_u8 img;
clip_image_f32 img_res;
if (!clip_image_load_from_file(img_path, &img)) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
clip_free(ctx_clip);
return 1;
}
if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) {
fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path);
clip_free(ctx_clip);
return 1;
}
int n_img_pos = clip_n_patches(ctx_clip);
int n_img_embd = clip_n_mmproj_embd(ctx_clip);
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
if (!image_embd) {
fprintf(stderr, "Unable to allocate memory for image embeddings\n");
return 1;
}
const int64_t t_img_enc_start_us = ggml_time_us(); const int64_t t_img_enc_start_us = ggml_time_us();
if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) { bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
clip_image_f32_free(img_res);
if (!encoded) {
fprintf(stderr, "Unable to encode image\n"); fprintf(stderr, "Unable to encode image\n");
return 1; return false;
} }
const int64_t t_img_enc_end_us = ggml_time_us(); const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
// we get the embeddings, free up the memory required for CLIP printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
clip_free(ctx_clip);
llama_backend_init(params.numa); return true;
}
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = params.n_gpu_layers; bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
model_params.main_gpu = params.main_gpu; // make sure that the correct mmproj was used, i.e., compare apples to apples
model_params.tensor_split = params.tensor_split; int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
model_params.use_mmap = params.use_mmap; auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
model_params.use_mlock = params.use_mlock; if (n_image_embd != n_llama_embd) {
printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); return false;
if (model == NULL) { }
fprintf(stderr , "%s: error: unable to load model\n" , __func__); return true;
return 1; }
}
static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
llama_context_params ctx_params = llama_context_default_params(); float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
if (!image_embd) {
ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings fprintf(stderr, "Unable to allocate memory for image embeddings\n");
ctx_params.n_threads = params.n_threads; free(image_embd);
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; return false;
ctx_params.seed = params.seed; }
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
if (ctx_llama == NULL) { fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); free(image_embd);
return 1; return false;
} }
*image_embd_out = image_embd;
// make sure that the correct mmproj was used, i.e., compare apples to apples *n_img_pos_out = n_img_pos;
const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
return true;
if (n_img_embd != n_llama_embd) { }
printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
llama_free(ctx_llama); int n_embd = llama_n_embd(llama_get_model(ctx_llama));
llama_free_model(model);
llama_backend_free(); for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
free(image_embd); int n_eval = image_embed->n_image_pos - i;
if (n_eval > n_batch) {
return 1; n_eval = n_batch;
} }
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
// process the prompt if (llama_decode(ctx_llama, batch)) {
// llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:" fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
int n_past = 0; }
*n_past += n_eval;
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; }
return true;
eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true); }
eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false); LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
clip_image_u8 * img = make_clip_image_u8();
// generate the response if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
printf("\n"); fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
printf("prompt: '%s'\n", params.prompt.c_str()); return NULL;
printf("\n"); }
for (int i = 0; i < max_tgt_len; i++) { float* image_embed = NULL;
const char * tmp = sample(ctx_llama, params, &n_past); int n_image_pos = 0;
if (strcmp(tmp, "</s>") == 0) break; bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
printf("%s", tmp); clip_image_u8_free(img);
fflush(stdout); fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
} return NULL;
}
printf("\n");
clip_image_u8_free(img);
{ auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; result->embed = image_embed;
result->n_image_pos = n_image_pos;
printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos); return result;
} }
llama_print_timings(ctx_llama); static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb");
llama_free(ctx_llama); if (file == NULL) {
llama_free_model(model); fprintf(stderr, "%s: can't read file %s\n", __func__, path);
llama_backend_free(); return false;
free(image_embd); }
return 0; fseek(file, 0, SEEK_END);
auto fileSize = ftell(file);
fseek(file, 0, SEEK_SET);
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) {
fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error");
fclose(file);
return false;
}
fread(buffer, 1, fileSize, file); // Read the file into the buffer
fclose(file); // Close the file
*bytesOut = buffer;
*sizeOut = fileSize;
return true;
}
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
unsigned char* image_bytes;
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
return NULL;
}
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
free(image_bytes);
return embed;
}
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) {
free(embed->embed);
free(embed);
} }

50
examples/llava/llava.h Normal file
View file

@ -0,0 +1,50 @@
#ifndef LLAVA_H
#define LLAVA_H
#include "ggml.h"
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAVA_API __declspec(dllexport)
# else
# define LLAVA_API __declspec(dllimport)
# endif
# else
# define LLAVA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAVA_API
#endif
struct clip_ctx;
#ifdef __cplusplus
extern "C" {
#endif
struct llava_image_embed {
float * embed;
int n_image_pos;
};
/** sanity check for clip <-> llava embed size match */
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
/** build an image embed from image file bytes */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** free an embedding made with llava_image_embed_make_* */
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
) )
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32) if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif() endif()

View file

@ -7,7 +7,7 @@ Command line options:
- `--threads N`, `-t N`: Set the number of threads to use during generation. - `--threads N`, `-t N`: Set the number of threads to use during generation.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.

View file

@ -39,10 +39,6 @@
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceGetMemPool hipDeviceGetMemPool
#define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold
#define cudaMemPoolSetAttribute hipMemPoolSetAttribute
#define cudaMemPool_t hipMemPool_t
#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize #define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t #define cudaError_t hipError_t
@ -52,7 +48,6 @@
#define cudaEvent_t hipEvent_t #define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy #define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree #define cudaFree hipFree
#define cudaFreeAsync hipFreeAsync
#define cudaFreeHost hipHostFree #define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice #define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount #define cudaGetDeviceCount hipGetDeviceCount
@ -60,7 +55,6 @@
#define cudaGetErrorString hipGetErrorString #define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError #define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc #define cudaMalloc hipMalloc
#define cudaMallocFromPoolAsync hipMallocFromPoolAsync
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy #define cudaMemcpy hipMemcpy
#define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpy2DAsync hipMemcpy2DAsync
@ -187,11 +181,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
do { \ do { \
cudaError_t err_ = (err); \ cudaError_t err_ = (err); \
if (err_ != cudaSuccess) { \ if (err_ != cudaSuccess) { \
int dev_id; \ int id; \
cudaGetDevice(&dev_id); \ cudaGetDevice(&id); \
fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
cudaGetErrorString(err_)); \ cudaGetErrorString(err_)); \
fprintf(stderr, "current device: %d\n", dev_id); \ fprintf(stderr, "current device: %d\n", id); \
exit(1); \ exit(1); \
} \ } \
} while (0) } while (0)
@ -201,11 +195,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
do { \ do { \
cublasStatus_t err_ = (err); \ cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \ if (err_ != CUBLAS_STATUS_SUCCESS) { \
int dev_id; \ int id; \
cudaGetDevice(&dev_id); \ cudaGetDevice(&id); \
fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \ fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \ err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
fprintf(stderr, "current device: %d\n", dev_id); \ fprintf(stderr, "current device: %d\n", id); \
exit(1); \ exit(1); \
} \ } \
} while (0) } while (0)
@ -471,7 +465,6 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
#define MAX_STREAMS 8 #define MAX_STREAMS 8
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr }; static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr };
struct ggml_tensor_extra_gpu { struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@ -5780,16 +5773,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
return ptr; return ptr;
} }
static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int id, cudaStream_t stream) {
if (g_cudaMemPools[id] == nullptr) {
return ggml_cuda_pool_malloc(size, actual_size);
}
void *ptr;
CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[id], stream));
*actual_size = size;
return ptr;
}
static void ggml_cuda_pool_free(void * ptr, size_t size) { static void ggml_cuda_pool_free(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock); scoped_spin_lock lock(g_cuda_pool_lock);
int id; int id;
@ -5807,12 +5790,10 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
CUDA_CHECK(cudaFree(ptr)); CUDA_CHECK(cudaFree(ptr));
} }
static bool g_cublas_loaded = false;
static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) { bool ggml_cublas_loaded(void) {
if (g_cudaMemPools[id] == nullptr) { return g_cublas_loaded;
return ggml_cuda_pool_free(ptr, actual_size);
}
CUDA_CHECK(cudaFreeAsync(ptr, stream));
} }
void ggml_init_cublas() { void ggml_init_cublas() {
@ -5827,7 +5808,12 @@ void ggml_init_cublas() {
CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaDeviceSynchronize());
#endif #endif
CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
initialized = true;
g_cublas_loaded = false;
return;
}
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0; int64_t total_vram = 0;
#if defined(GGML_CUDA_FORCE_MMQ) #if defined(GGML_CUDA_FORCE_MMQ)
@ -5869,19 +5855,13 @@ void ggml_init_cublas() {
// create cublas handle // create cublas handle
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH)); CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
// configure memory pool
cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
if (err == cudaSuccess) {
size_t treshold = UINT64_MAX;
CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
}
} }
// configure logging to stdout // configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
initialized = true; initialized = true;
g_cublas_loaded = true;
} }
} }
@ -6469,7 +6449,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
GGML_ASSERT(to_fp16_cuda != nullptr); GGML_ASSERT(to_fp16_cuda != nullptr);
size_t ne = row_diff*ne00; size_t ne = row_diff*ne00;
src0_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src0_as, id, stream); src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream); to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
} }
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16; const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
@ -6480,12 +6460,13 @@ inline void ggml_cuda_op_mul_mat_cublas(
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
GGML_ASSERT(to_fp16_cuda != nullptr); GGML_ASSERT(to_fp16_cuda != nullptr);
size_t ne = src1_ncols*ne10; size_t ne = src1_ncols*ne10;
src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream); src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream); to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
} }
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16; const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
size_t dst_f16_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream); size_t dst_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
const half alpha_f16 = 1.0f; const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f; const half beta_f16 = 0.0f;
@ -6503,15 +6484,14 @@ inline void ggml_cuda_op_mul_mat_cublas(
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream); to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
if (dst_f16_as != 0) { ggml_cuda_pool_free(dst_f16, dst_as);
ggml_cuda_pool_free_async(dst_f16, dst_f16_as, id, stream);
}
if (src0_as != 0) { if (src0_as != 0) {
ggml_cuda_pool_free_async(src0_as_f16, src0_as, id, stream); ggml_cuda_pool_free(src0_as_f16, src0_as);
} }
if (src1_as != 0) { if (src1_as != 0) {
ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, stream); ggml_cuda_pool_free(src1_as_f16, src1_as);
} }
} }
else { else {
@ -6521,7 +6501,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
if (src0->type != GGML_TYPE_F32) { if (src0->type != GGML_TYPE_F32) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
GGML_ASSERT(to_fp32_cuda != nullptr); GGML_ASSERT(to_fp32_cuda != nullptr);
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc_async(row_diff*ne00 * sizeof(float), &src0_as, id, stream); // NOLINT src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream); to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
} }
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32; const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
@ -6538,7 +6518,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
&beta, dst_dd_i, ldc)); &beta, dst_dd_i, ldc));
if (src0_as != 0) { if (src0_as != 0) {
ggml_cuda_pool_free_async(src0_ddq_as_f32, src0_as, id, stream); ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
} }
} }
@ -6924,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
int64_t row_low[GGML_CUDA_MAX_DEVICES]; int64_t row_low[GGML_CUDA_MAX_DEVICES];
int64_t row_high[GGML_CUDA_MAX_DEVICES]; int64_t row_high[GGML_CUDA_MAX_DEVICES];
int used_devices = 0;
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
// by default, use all rows // by default, use all rows
row_low[id] = 0; row_low[id] = 0;
@ -6951,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
continue; continue;
} }
used_devices++;
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
@ -6961,22 +6945,21 @@ static void ggml_cuda_op_mul_mat(
src0_dd[id] = (char *) src0_extra->data_device[id]; src0_dd[id] = (char *) src0_extra->data_device[id];
} else { } else {
const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream); src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
} }
if (src1_on_device && src1_is_contiguous) { if (src1_on_device && src1_is_contiguous) {
src1_ddf[id] = (float *) src1_extra->data_device[id]; src1_ddf[id] = (float *) src1_extra->data_device[id];
} else { } else {
src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream); src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
} }
if (convert_src1_to_q8_1) { if (convert_src1_to_q8_1) {
const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs; src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream);
if (src1_on_device && src1_is_contiguous) { if (src1_on_device && src1_is_contiguous) {
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
// CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaGetLastError());
} }
} }
@ -6984,18 +6967,18 @@ static void ggml_cuda_op_mul_mat(
dst_dd[id] = (float *) dst_extra->data_device[id]; dst_dd[id] = (float *) dst_extra->data_device[id];
} else { } else {
const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst); const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id, stream); dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
} }
} }
// if multiple devices are used they need to wait for the main device // if multiple devices are used they need to wait for the main device
// here an event is recorded that signals that the main device has finished calculating the input data // here an event is recorded that signals that the main device has finished calculating the input data
if (split && g_device_count > 1) { if (split && used_devices > 1) {
CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(ggml_cuda_set_device(g_main_device));
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0])); CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
} }
const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@ -7110,6 +7093,27 @@ static void ggml_cuda_op_mul_mat(
} }
} }
for (int64_t id = 0; id < g_device_count; ++id) {
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
continue;
}
CUDA_CHECK(ggml_cuda_set_device(id));
// free buffers again when done
if (src0_as[id] > 0) {
ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
}
if (src1_asf[id] > 0) {
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
}
if (src1_asq[id] > 0) {
ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
}
if (dst_as[id] > 0) {
ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
}
}
// main device waits for all other devices to be finished // main device waits for all other devices to be finished
if (split && g_device_count > 1) { if (split && g_device_count > 1) {
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
@ -7117,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(
CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(ggml_cuda_set_device(g_main_device));
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
if (row_low[id] == row_high[id]) {
continue;
}
for (int64_t is = 0; is < is_max; ++is) { for (int64_t is = 0; is < is_max; ++is) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0)); CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
} }
@ -7127,21 +7134,6 @@ static void ggml_cuda_op_mul_mat(
CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(ggml_cuda_set_device(g_main_device));
CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaDeviceSynchronize());
} }
for (int64_t id = 0; id < g_device_count; ++id) {
if (src0_as[id] > 0) {
ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]);
}
if (src1_asf[id] > 0) {
ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]);
}
if (src1_asq[id] > 0) {
ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]);
}
if (dst_as[id] > 0) {
ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]);
}
}
} }
static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -7177,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
} }
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
if (!g_cublas_loaded) return false;
const int64_t ne10 = src1->ne[0]; const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0]; const int64_t ne0 = dst->ne[0];
@ -7328,11 +7322,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
GGML_ASSERT(to_fp16_cuda != nullptr); GGML_ASSERT(to_fp16_cuda != nullptr);
size_t src1_as = 0; size_t src1_as = 0;
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream); half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
size_t dst_as = 0; size_t dst_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream); half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0); GGML_ASSERT(ne13 % ne03 == 0);
@ -7386,8 +7380,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
size_t ptrs_src_s = 0; size_t ptrs_src_s = 0;
size_t ptrs_dst_s = 0; size_t ptrs_dst_s = 0;
ptrs_src = (const void **) ggml_cuda_pool_malloc_async(2*ne23*sizeof(void *), &ptrs_src_s, id, main_stream); ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
ptrs_dst = ( void **) ggml_cuda_pool_malloc_async(1*ne23*sizeof(void *), &ptrs_dst_s, id, main_stream); ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
dim3 block_dims(ne13, ne12); dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
@ -7400,6 +7394,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
dst->nb[2], dst->nb[3], dst->nb[2], dst->nb[3],
r2, r3); r2, r3);
CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaGetLastError());
CUBLAS_CHECK( CUBLAS_CHECK(
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10, ne01, ne11, ne10,
@ -7411,30 +7406,29 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (ptrs_src_s != 0) { if (ptrs_src_s != 0) {
ggml_cuda_pool_free_async(ptrs_src, ptrs_src_s, id, main_stream); ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
} }
if (ptrs_dst_s != 0) { if (ptrs_dst_s != 0) {
ggml_cuda_pool_free_async(ptrs_dst, ptrs_dst_s, id, main_stream); ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
} }
} }
#endif #endif
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
if (src1_as != 0) {
ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream); ggml_cuda_pool_free(src1_as_f16, src1_as);
} ggml_cuda_pool_free(dst_f16, dst_as);
if (dst_as != 0) {
ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream);
}
} }
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool all_on_device = const bool all_on_device =
(src0->backend == GGML_BACKEND_GPU) && (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
(src1->backend == GGML_BACKEND_GPU) && (src1->backend == GGML_BACKEND_GPU) &&
( dst->backend == GGML_BACKEND_GPU); ( dst->backend == GGML_BACKEND_GPU);
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
int64_t min_compute_capability = INT_MAX; int64_t min_compute_capability = INT_MAX;
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@ -7456,13 +7450,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch // KQ single-batch
ggml_cuda_mul_mat_vec_p021(src0, src1, dst); ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
} else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch // KQV single-batch
ggml_cuda_mul_mat_vec_nc(src0, src1, dst); ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
} else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
// KQ + KQV multi-batch // KQ + KQV multi-batch
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
} else if (src0->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_F32) {
@ -7862,6 +7856,8 @@ void ggml_cuda_free_scratch() {
} }
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
if (!g_cublas_loaded) return false;
ggml_cuda_func_t func; ggml_cuda_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))

View file

@ -17,7 +17,12 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16 #define GGML_CUDA_MAX_DEVICES 16
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
GGML_API void ggml_init_cublas(void); GGML_API void ggml_init_cublas(void);
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API bool ggml_cublas_loaded(void);
GGML_API void * ggml_cuda_host_malloc(size_t size); GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr); GGML_API void ggml_cuda_host_free(void * ptr);

330
ggml.c
View file

@ -4970,8 +4970,13 @@ struct ggml_tensor * ggml_rope_back(
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
int n_orig_ctx,
float freq_base, float freq_base,
float freq_scale, float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
float xpos_base, float xpos_base,
bool xpos_down) { bool xpos_down) {
GGML_ASSERT(ggml_is_vector(b)); GGML_ASSERT(ggml_is_vector(b));
@ -4988,11 +4993,15 @@ struct ggml_tensor * ggml_rope_back(
struct ggml_tensor * result = ggml_dup_tensor(ctx, a); struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 5, &freq_scale, sizeof(float)); memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 6, &xpos_base, sizeof(float)); memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 7, &xpos_down, sizeof(bool)); memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
memcpy(params + 11, &xpos_base, sizeof(float));
memcpy(params + 12, &xpos_down, sizeof(bool));
ggml_set_op_params(result, params, sizeof(params)); ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE_BACK; result->op = GGML_OP_ROPE_BACK;
@ -10974,7 +10983,8 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst,
const bool forward) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
} }
@ -11033,6 +11043,11 @@ static void ggml_compute_forward_rope_f32(
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
// backward process uses inverse rotation by cos and sin.
// cos and sin build a rotation matrix, where the inverse is the transpose.
// this essentially just switches the sign of sin.
const float sin_sign = forward ? 1.0f : -1.0f;
const int32_t * pos = (const int32_t *) src1->data; const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
@ -11049,9 +11064,9 @@ static void ggml_compute_forward_rope_f32(
float block_theta = MAX(p - (n_ctx - 2), 0); float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta_base); const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base); const float sin_theta = sinf(theta_base) * sin_sign;
const float cos_block_theta = cosf(block_theta); const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta); const float sin_block_theta = sinf(block_theta) * sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
block_theta *= theta_scale; block_theta *= theta_scale;
@ -11075,6 +11090,7 @@ static void ggml_compute_forward_rope_f32(
rope_yarn( rope_yarn(
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
); );
sin_theta *= sin_sign;
// zeta scaling for xPos only: // zeta scaling for xPos only:
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@ -11105,6 +11121,7 @@ static void ggml_compute_forward_rope_f32(
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
&cos_theta, &sin_theta &cos_theta, &sin_theta
); );
sin_theta *= sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
@ -11130,7 +11147,8 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst,
const bool forward) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
} }
@ -11182,6 +11200,11 @@ static void ggml_compute_forward_rope_f16(
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
// backward process uses inverse rotation by cos and sin.
// cos and sin build a rotation matrix, where the inverse is the transpose.
// this essentially just switches the sign of sin.
const float sin_sign = forward ? 1.0f : -1.0f;
const int32_t * pos = (const int32_t *) src1->data; const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
@ -11198,9 +11221,9 @@ static void ggml_compute_forward_rope_f16(
float block_theta = MAX(p - (n_ctx - 2), 0); float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta_base); const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base); const float sin_theta = sinf(theta_base) * sin_sign;
const float cos_block_theta = cosf(block_theta); const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta); const float sin_block_theta = sinf(block_theta) * sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
block_theta *= theta_scale; block_theta *= theta_scale;
@ -11224,6 +11247,7 @@ static void ggml_compute_forward_rope_f16(
rope_yarn( rope_yarn(
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
); );
sin_theta *= sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
@ -11250,6 +11274,7 @@ static void ggml_compute_forward_rope_f16(
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
&cos_theta, &sin_theta &cos_theta, &sin_theta
); );
sin_theta *= sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
@ -11279,11 +11304,11 @@ static void ggml_compute_forward_rope(
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
ggml_compute_forward_rope_f16(params, src0, src1, dst); ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
} break; } break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
ggml_compute_forward_rope_f32(params, src0, src1, dst); ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
} break; } break;
default: default:
{ {
@ -11294,216 +11319,6 @@ static void ggml_compute_forward_rope(
// ggml_compute_forward_rope_back // ggml_compute_forward_rope_back
static void ggml_compute_forward_rope_back_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
// y = rope(x, src1)
// dx = rope_back(dy, src1)
// src0 is dy, src1 contains options
float freq_base;
float freq_scale;
// these two only relevant for xPos RoPE:
float xpos_base;
bool xpos_down;
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
assert(nb0 == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(dst);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
// row index used to determine which thread to use
int ir = 0;
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const bool is_neox = mode & 2;
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = pos[i2];
for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue;
if (ir > ir1) break;
float theta_base = freq_scale * (float)p;
if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base);
// zeta scaling for xPos only:
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
if (xpos_down) zeta = 1.0f / zeta;
theta_base *= theta_scale;
const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float dy0 = dy[0];
const float dy1 = dy[1];
dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) {
const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base);
theta_base *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float dy0 = dy[0];
const float dy1 = dy[n_dims/2];
dx[0] = dy0*cos_theta + dy1*sin_theta;
dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
}
}
}
}
}
}
}
static void ggml_compute_forward_rope_back_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
// y = rope(x, src1)
// dx = rope_back(dy, src1)
// src0 is dy, src1 contains options
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
assert(nb0 == sizeof(ggml_fp16_t));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(dst);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
// row index used to determine which thread to use
int ir = 0;
const float theta_scale = powf(10000.0, -2.0f/n_dims);
const bool is_neox = mode & 2;
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = pos[i2];
for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue;
if (ir > ir1) break;
float theta_base = (float)p;
if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base);
theta_base *= theta_scale;
const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float dy0 = GGML_FP16_TO_FP32(dy[0]);
const float dy1 = GGML_FP16_TO_FP32(dy[1]);
dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) {
const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base);
theta_base *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float dy0 = GGML_FP16_TO_FP32(dy[0]);
const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
}
}
}
}
}
}
}
static void ggml_compute_forward_rope_back( static void ggml_compute_forward_rope_back(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
@ -11512,11 +11327,11 @@ static void ggml_compute_forward_rope_back(
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
ggml_compute_forward_rope_back_f16(params, src0, src1, dst); ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
} break; } break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
ggml_compute_forward_rope_back_f32(params, src0, src1, dst); ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
} break; } break;
default: default:
{ {
@ -15559,17 +15374,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
// necessary for llama // necessary for llama
if (src0->grad) { if (src0->grad) {
//const int n_past = ((int32_t *) tensor->op_params)[0]; //const int n_past = ((int32_t *) tensor->op_params)[0];
const int n_dims = ((int32_t *) tensor->op_params)[1]; const int n_dims = ((int32_t *) tensor->op_params)[1];
const int mode = ((int32_t *) tensor->op_params)[2]; const int mode = ((int32_t *) tensor->op_params)[2];
const int n_ctx = ((int32_t *) tensor->op_params)[3]; const int n_ctx = ((int32_t *) tensor->op_params)[3];
float freq_base; const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
float freq_scale; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
float xpos_base;
bool xpos_down; memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
src0->grad = ggml_add_or_set(ctx, src0->grad = ggml_add_or_set(ctx,
src0->grad, src0->grad,
@ -15579,8 +15397,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
n_dims, n_dims,
mode, mode,
n_ctx, n_ctx,
n_orig_ctx,
freq_base, freq_base,
freq_scale, freq_scale,
ext_factor,
attn_factor,
beta_fast,
beta_slow,
xpos_base, xpos_base,
xpos_down), xpos_down),
zero_table); zero_table);
@ -15590,17 +15413,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{ {
if (src0->grad) { if (src0->grad) {
//const int n_past = ((int32_t *) tensor->op_params)[0]; //const int n_past = ((int32_t *) tensor->op_params)[0];
const int n_dims = ((int32_t *) tensor->op_params)[1]; const int n_dims = ((int32_t *) tensor->op_params)[1];
const int mode = ((int32_t *) tensor->op_params)[2]; const int mode = ((int32_t *) tensor->op_params)[2];
const int n_ctx = ((int32_t *) tensor->op_params)[3]; const int n_ctx = ((int32_t *) tensor->op_params)[3];
float freq_base; const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
float freq_scale; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
float xpos_base;
bool xpos_down; memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
src0->grad = ggml_add_or_set(ctx, src0->grad = ggml_add_or_set(ctx,
src0->grad, src0->grad,
@ -15609,14 +15435,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
src1, src1,
n_dims, n_dims,
mode, mode,
0,
n_ctx, n_ctx,
n_orig_ctx,
freq_base, freq_base,
freq_scale, freq_scale,
0.0f, ext_factor,
1.0f, attn_factor,
0.0f, beta_fast,
0.0f, beta_slow,
xpos_base, xpos_base,
xpos_down, xpos_down,
false), false),

5
ggml.h
View file

@ -1372,8 +1372,13 @@ extern "C" {
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
int n_orig_ctx,
float freq_base, float freq_base,
float freq_scale, float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
float xpos_base, float xpos_base,
bool xpos_down); bool xpos_down);

192
llama.cpp
View file

@ -617,19 +617,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
// llama helpers // llama helpers
// //
inline void * llama_host_malloc(size_t n) {
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
# define llama_host_malloc(n) ggml_cuda_host_malloc(n) if (ggml_cublas_loaded()) {
# define llama_host_free(data) ggml_cuda_host_free(data) return ggml_cuda_host_malloc(n);
} else {
return malloc(n);
}
#elif GGML_USE_METAL #elif GGML_USE_METAL
# define llama_host_malloc(n) ggml_metal_host_malloc(n) return ggml_metal_host_malloc(n);
# define llama_host_free(data) ggml_metal_host_free(data)
#elif GGML_USE_CPU_HBM #elif GGML_USE_CPU_HBM
# define llama_host_malloc(n) hbw_malloc(n) return hbw_malloc(n);
# define llama_host_free(data) if (data != NULL) hbw_free(data)
#else #else
# define llama_host_malloc(n) malloc(n) return malloc(n);
# define llama_host_free(data) free(data)
#endif #endif
}
inline void llama_host_free(void * ptr) {
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
return ggml_cuda_host_free(ptr);
} else {
return free(ptr);
}
#elif GGML_USE_METAL
return ggml_metal_host_free(ptr);
#elif GGML_USE_CPU_HBM
return hbw_free(ptr);
#else
return free(ptr);
#endif
}
#if defined(_WIN32) #if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) { static std::string llama_format_win_err(DWORD err) {
@ -1221,9 +1239,11 @@ struct llama_kv_cache {
} }
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
ggml_cuda_free_data(k); if (ggml_cublas_loaded()) {
ggml_cuda_free_data(v); ggml_cuda_free_data(k);
#endif // GGML_USE_CUBLAS ggml_cuda_free_data(v);
}
#endif
} }
}; };
@ -1323,11 +1343,15 @@ struct llama_model {
} }
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
for (size_t i = 0; i < tensors_by_name.size(); ++i) { if (ggml_cublas_loaded()) {
ggml_cuda_free_data(tensors_by_name[i].second); for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second);
}
ggml_cuda_free_scratch();
} }
ggml_cuda_free_scratch(); #endif
#elif defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CLBLAST)
for (size_t i = 0; i < tensors_by_name.size(); ++i) { for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cl_free_data(tensors_by_name[i].second); ggml_cl_free_data(tensors_by_name[i].second);
} }
@ -1439,23 +1463,26 @@ static bool llama_kv_cache_init(
ggml_set_name(cache.v, "cache_v"); ggml_set_name(cache.v, "cache_v");
(void) n_gpu_layers; (void) n_gpu_layers;
#ifdef GGML_USE_CUBLAS
size_t vram_kv_cache = 0;
if (n_gpu_layers > (int)n_layer + 1) { #ifdef GGML_USE_CUBLAS
ggml_cuda_assign_buffers_no_scratch(cache.v); if (ggml_cublas_loaded()) {
LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); size_t vram_kv_cache = 0;
vram_kv_cache += ggml_nbytes(cache.v);
if (n_gpu_layers > (int)n_layer + 1) {
ggml_cuda_assign_buffers_no_scratch(cache.v);
LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
vram_kv_cache += ggml_nbytes(cache.v);
}
if (n_gpu_layers > (int)n_layer + 2) {
ggml_cuda_assign_buffers_no_scratch(cache.k);
LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
vram_kv_cache += ggml_nbytes(cache.k);
}
if (vram_kv_cache > 0) {
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
}
} }
if (n_gpu_layers > (int)n_layer + 2) { #endif
ggml_cuda_assign_buffers_no_scratch(cache.k);
LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
vram_kv_cache += ggml_nbytes(cache.k);
}
if (vram_kv_cache > 0) {
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
}
#endif // GGML_USE_CUBLAS
return true; return true;
} }
@ -2552,18 +2579,22 @@ static void llm_load_tensors(
} }
(void) main_gpu; (void) main_gpu;
enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); if (ggml_cublas_loaded()) {
ggml_cuda_set_main_device(main_gpu); LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU ggml_cuda_set_main_device(main_gpu);
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
llama_backend_offload = GGML_BACKEND_GPU;
llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
}
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU llama_backend_offload = GGML_BACKEND_GPU;
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU llama_backend_offload_split = GGML_BACKEND_GPU;
#else
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
#endif #endif
// prepare memory for the weights // prepare memory for the weights
@ -2590,12 +2621,12 @@ static void llm_load_tensors(
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU // on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32 #ifndef _WIN32
backend_norm = LLAMA_BACKEND_OFFLOAD; backend_norm = llama_backend_offload;
#else #else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32 #endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; backend_output = llama_backend_offload_split;
} else { } else {
backend_norm = GGML_BACKEND_CPU; backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU; backend_output = GGML_BACKEND_CPU;
@ -2619,8 +2650,8 @@ static void llm_load_tensors(
model.layers.resize(n_layer); model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -2656,12 +2687,12 @@ static void llm_load_tensors(
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU // on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32 #ifndef _WIN32
backend_norm = LLAMA_BACKEND_OFFLOAD; backend_norm = llama_backend_offload;
#else #else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32 #endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; backend_output = llama_backend_offload_split;
} else { } else {
backend_norm = GGML_BACKEND_CPU; backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU; backend_output = GGML_BACKEND_CPU;
@ -2685,8 +2716,8 @@ static void llm_load_tensors(
model.layers.resize(n_layer); model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -2726,12 +2757,12 @@ static void llm_load_tensors(
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU // on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32 #ifndef _WIN32
backend_norm = LLAMA_BACKEND_OFFLOAD; backend_norm = llama_backend_offload;
#else #else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32 #endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; backend_output = llama_backend_offload_split;
} else { } else {
backend_norm = GGML_BACKEND_CPU; backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU; backend_output = GGML_BACKEND_CPU;
@ -2757,8 +2788,8 @@ static void llm_load_tensors(
model.layers.resize(n_layer); model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -2803,12 +2834,12 @@ static void llm_load_tensors(
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU // on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32 #ifndef _WIN32
backend_norm = LLAMA_BACKEND_OFFLOAD; backend_norm = llama_backend_offload;
#else #else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32 #endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; backend_output = llama_backend_offload_split;
} else { } else {
backend_norm = GGML_BACKEND_CPU; backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU; backend_output = GGML_BACKEND_CPU;
@ -2834,8 +2865,8 @@ static void llm_load_tensors(
model.layers.resize(n_layer); model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -2880,12 +2911,12 @@ static void llm_load_tensors(
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU // on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32 #ifndef _WIN32
backend_norm = LLAMA_BACKEND_OFFLOAD; backend_norm = llama_backend_offload;
#else #else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32 #endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; backend_output = llama_backend_offload_split;
} else { } else {
backend_norm = GGML_BACKEND_CPU; backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU; backend_output = GGML_BACKEND_CPU;
@ -2908,8 +2939,8 @@ static void llm_load_tensors(
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer); model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
auto & layer = model.layers[i]; auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@ -2946,12 +2977,12 @@ static void llm_load_tensors(
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU // on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32 #ifndef _WIN32
backend_norm = LLAMA_BACKEND_OFFLOAD; backend_norm = llama_backend_offload;
#else #else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32 #endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; backend_output = llama_backend_offload_split;
} else { } else {
backend_norm = GGML_BACKEND_CPU; backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU; backend_output = GGML_BACKEND_CPU;
@ -2977,8 +3008,8 @@ static void llm_load_tensors(
model.layers.resize(n_layer); model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -3024,12 +3055,12 @@ static void llm_load_tensors(
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU // on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32 #ifndef _WIN32
backend_norm = LLAMA_BACKEND_OFFLOAD; backend_norm = llama_backend_offload;
#else #else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32 #endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; backend_output = llama_backend_offload_split;
} else { } else {
backend_norm = GGML_BACKEND_CPU; backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU; backend_output = GGML_BACKEND_CPU;
@ -3053,8 +3084,8 @@ static void llm_load_tensors(
model.layers.resize(n_layer); model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -5397,11 +5428,12 @@ static int llama_decode_internal(
// If all tensors can be run on the GPU then using more than 1 thread is detrimental. // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
const bool full_offload_supported = const bool full_offload_supported =
model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_LLAMA ||
model.arch == LLM_ARCH_BAICHUAN || model.arch == LLM_ARCH_BAICHUAN ||
model.arch == LLM_ARCH_FALCON || model.arch == LLM_ARCH_FALCON ||
model.arch == LLM_ARCH_REFACT || model.arch == LLM_ARCH_REFACT ||
model.arch == LLM_ARCH_MPT || model.arch == LLM_ARCH_MPT ||
model.arch == LLM_ARCH_STARCODER ||
model.arch == LLM_ARCH_STABLELM; model.arch == LLM_ARCH_STABLELM;
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;