Merge branch 'master' into stablelm-support

2023-11-07 11:22:13 +01:00 · 2023-11-07 11:22:13 +01:00 · 698c94595e
commit 698c94595e
parent c95937642a e9c1cecb9d
24 changed files with 1437 additions and 777 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -288,6 +288,7 @@ jobs:
      OPENBLAS_VERSION: 0.3.23
      OPENCL_VERSION: 2023.04.17
      CLBLAST_VERSION: 1.6.0
      SDE_VERSION: 9.21.1-2023-04-24
    strategy:
      matrix:
@ -383,11 +384,23 @@ jobs:
      - name: Test
        id: cmake_test
-        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
+        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
        run: |
          cd build
          ctest -C Release --verbose --timeout 900
      - name: Test (Intel SDE)
        id: cmake_test_sde
        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
        run: |
          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
          # for some weird reason windows tar doesn't like sde tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
          cd build
          & $sde -future -- ctest -C Release --verbose --timeout 900
      - name: Determine tag name
        id: tag
        shell: bash
--- a/.gitignore
+++ b/.gitignore
@ -46,7 +46,7 @@ models-mnt
 /infill
 /libllama.so
 /llama-bench
-/llava
+/llava-cli
 /main
 /metal
 /perplexity
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -10,7 +10,7 @@ endif()
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(LLAMA_STANDALONE ON)
    # configure project version
@ -44,7 +44,7 @@ endif()
 # general
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
-option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      OFF)
+option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
 # debug
@ -510,6 +510,10 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
    message(STATUS "x86 detected")
    if (MSVC)
        # instruction set detection for MSVC only
        if (LLAMA_NATIVE)
            include(cmake/FindSIMD.cmake)
        endif ()
        if (LLAMA_AVX512)
            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
--- a/7
+++ b/7
@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search  \
+	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
 	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
 # Binaries only useful for tests
@ -617,7 +617,10 @@ convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggm
 llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ $(LDFLAGS) -Wno-cast-qual
 llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
--- a/cmake/FindSIMD.cmake
+++ b/cmake/FindSIMD.cmake
@ -0,0 +1,100 @@
 include(CheckCSourceRuns)
 set(AVX_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256 a;
        a = _mm256_set1_ps(0);
        return 0;
    }
 ")
 set(AVX512_CODE "
    #include <immintrin.h>
    int main()
    {
        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0);
        __m512i b = a;
        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
        return 0;
    }
 ")
 set(AVX2_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256i a = {0};
        a = _mm256_abs_epi16(a);
        __m256i x;
        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
        return 0;
    }
 ")
 set(FMA_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256 acc = _mm256_setzero_ps();
        const __m256 d = _mm256_setzero_ps();
        const __m256 p = _mm256_setzero_ps();
        acc = _mm256_fmadd_ps( d, p, acc );
        return 0;
    }
 ")
 macro(check_sse type flags)
    set(__FLAG_I 1)
    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
    foreach (__FLAG ${flags})
        if (NOT ${type}_FOUND)
            set(CMAKE_REQUIRED_FLAGS ${__FLAG})
            check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
            if (HAS_${type}_${__FLAG_I})
                set(${type}_FOUND TRUE CACHE BOOL "${type} support")
                set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
            endif()
            math(EXPR __FLAG_I "${__FLAG_I}+1")
        endif()
    endforeach()
    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
    if (NOT ${type}_FOUND)
        set(${type}_FOUND FALSE CACHE BOOL "${type} support")
        set(${type}_FLAGS "" CACHE STRING "${type} flags")
    endif()
    mark_as_advanced(${type}_FOUND ${type}_FLAGS)
 endmacro()
 # flags are for MSVC only!
 check_sse("AVX" " ;/arch:AVX")
 if (NOT ${AVX_FOUND})
    set(LLAMA_AVX OFF)
 else()
    set(LLAMA_AVX ON)
 endif()
 check_sse("AVX2" " ;/arch:AVX2")
 check_sse("FMA" " ;/arch:AVX2")
 if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
    set(LLAMA_AVX2 OFF)
 else()
    set(LLAMA_AVX2 ON)
 endif()
 check_sse("AVX512" " ;/arch:AVX512")
 if (NOT ${AVX512_FOUND})
    set(LLAMA_AVX512 OFF)
 else()
    set(LLAMA_AVX512 ON)
 endif()
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -41,6 +41,7 @@ endif()
 set(TARGET common)
 add_library(${TARGET} STATIC
    base64.hpp
    common.h
    common.cpp
    sampling.h
--- a/common/base64.hpp
+++ b/common/base64.hpp
@ -0,0 +1,392 @@
 /*
 This is free and unencumbered software released into the public domain.
 Anyone is free to copy, modify, publish, use, compile, sell, or
 distribute this software, either in source code form or as a compiled
 binary, for any purpose, commercial or non-commercial, and by any
 means.
 In jurisdictions that recognize copyright laws, the author or authors
 of this software dedicate any and all copyright interest in the
 software to the public domain. We make this dedication for the benefit
 of the public at large and to the detriment of our heirs and
 successors. We intend this dedication to be an overt act of
 relinquishment in perpetuity of all present and future rights to this
 software under copyright law.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
 For more information, please refer to <http://unlicense.org>
 */
 #ifndef PUBLIC_DOMAIN_BASE64_HPP_
 #define PUBLIC_DOMAIN_BASE64_HPP_
 #include <cstdint>
 #include <iterator>
 #include <stdexcept>
 #include <string>
 class base64_error : public std::runtime_error
 {
 public:
    using std::runtime_error::runtime_error;
 };
 class base64
 {
 public:
    enum class alphabet
    {
        /** the alphabet is detected automatically */
        auto_,
        /** the standard base64 alphabet is used */
        standard,
        /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
        url_filename_safe
    };
    enum class decoding_behavior
    {
        /** if the input is not padded, the remaining bits are ignored */
        moderate,
        /** if a padding character is encounter decoding is finished */
        loose
    };
    /**
     Encodes all the elements from `in_begin` to `in_end` to `out`.
     @warning The source and destination cannot overlap. The destination must be able to hold at least
     `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
     @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
     8 bits
     @tparam Output_iterator the destination; the elements written to it are from the type `char`
     @param in_begin the beginning of the source
     @param in_end the ending of the source
     @param out the destination iterator
     @param alphabet which alphabet should be used
     @returns the iterator to the next element past the last element copied
     @throws see `Input_iterator` and `Output_iterator`
    */
    template<typename Input_iterator, typename Output_iterator>
    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
                                  alphabet alphabet = alphabet::standard)
    {
        constexpr auto pad = '=';
        const char* alpha  = alphabet == alphabet::url_filename_safe
                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
        while (in_begin != in_end) {
            std::uint8_t i0 = 0, i1 = 0, i2 = 0;
            // first character
            i0 = static_cast<std::uint8_t>(*in_begin);
            ++in_begin;
            *out = alpha[i0 >> 2 & 0x3f];
            ++out;
            // part of first character and second
            if (in_begin != in_end) {
                i1 = static_cast<std::uint8_t>(*in_begin);
                ++in_begin;
                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
                ++out;
            } else {
                *out = alpha[(i0 & 0x3) << 4];
                ++out;
                // last padding
                *out = pad;
                ++out;
                // last padding
                *out = pad;
                ++out;
                break;
            }
            // part of second character and third
            if (in_begin != in_end) {
                i2 = static_cast<std::uint8_t>(*in_begin);
                ++in_begin;
                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
                ++out;
            } else {
                *out = alpha[(i1 & 0xf) << 2];
                ++out;
                // last padding
                *out = pad;
                ++out;
                break;
            }
            // rest of third
            *out = alpha[i2 & 0x3f];
            ++out;
        }
        return out;
    }
    /**
     Encodes a string.
     @param str the string that should be encoded
     @param alphabet which alphabet should be used
     @returns the encoded base64 string
     @throws see base64::encode()
    */
    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
    {
        std::string result;
        result.reserve(required_encode_size(str.length()) + 1);
        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
        return result;
    }
    /**
     Encodes a char array.
     @param buffer the char array
     @param size the size of the array
     @param alphabet which alphabet should be used
     @returns the encoded string
    */
    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
    {
        std::string result;
        result.reserve(required_encode_size(size) + 1);
        encode(buffer, buffer + size, std::back_inserter(result), alphabet);
        return result;
    }
    /**
     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
     in other words: inplace decoding is possible.
     @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
     otherwise the behavior depends on the output iterator.
     @tparam Input_iterator the source; the returned elements are cast to `char`
     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
     @param in_begin the beginning of the source
     @param in_end the ending of the source
     @param out the destination iterator
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the iterator to the next element past the last element copied
     @throws base64_error depending on the set behavior
     @throws see `Input_iterator` and `Output_iterator`
    */
    template<typename Input_iterator, typename Output_iterator>
    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
                                  alphabet alphabet          = alphabet::auto_,
                                  decoding_behavior behavior = decoding_behavior::moderate)
    {
        //constexpr auto pad = '=';
        std::uint8_t last  = 0;
        auto bits          = 0;
        while (in_begin != in_end) {
            auto c = *in_begin;
            ++in_begin;
            if (c == '=') {
                break;
            }
            auto part = _base64_value(alphabet, c);
            // enough bits for one byte
            if (bits + 6 >= 8) {
                *out = (last << (8 - bits)) | (part >> (bits - 2));
                ++out;
                bits -= 2;
            } else {
                bits += 6;
            }
            last = part;
        }
        // check padding
        if (behavior != decoding_behavior::loose) {
            while (in_begin != in_end) {
                auto c = *in_begin;
                ++in_begin;
                if (c != '=') {
                    throw base64_error("invalid base64 character.");
                }
            }
        }
        return out;
    }
    /**
     Decodes a string.
     @param str the base64 encoded string
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the decoded string
     @throws see base64::decode()
    */
    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
                              decoding_behavior behavior = decoding_behavior::moderate)
    {
        std::string result;
        result.reserve(max_decode_size(str.length()));
        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
        return result;
    }
    /**
     Decodes a string.
     @param buffer the base64 encoded buffer
     @param size the size of the buffer
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the decoded string
     @throws see base64::decode()
    */
    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
                              decoding_behavior behavior = decoding_behavior::moderate)
    {
        std::string result;
        result.reserve(max_decode_size(size));
        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
        return result;
    }
    /**
     Decodes a string inplace.
     @param[in,out] str the base64 encoded string
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @throws base64::decode_inplace()
    */
    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
                               decoding_behavior behavior = decoding_behavior::moderate)
    {
        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
    }
    /**
     Decodes a char array inplace.
     @param[in,out] str the string array
     @param size the length of the array
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the pointer to the next element past the last element decoded
     @throws base64::decode_inplace()
    */
    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
                                decoding_behavior behavior = decoding_behavior::moderate)
    {
        return decode(str, str + size, str, alphabet, behavior);
    }
    /**
     Returns the required decoding size for a given size. The value is calculated with the following formula:
     $$
     \lceil \frac{size}{4} \rceil \cdot 3
     $$
     @param size the size of the encoded input
     @returns the size of the resulting decoded buffer; this the absolute maximum
    */
    static std::size_t max_decode_size(std::size_t size) noexcept
    {
        return (size / 4 + (size % 4 ? 1 : 0)) * 3;
    }
    /**
     Returns the required encoding size for a given size. The value is calculated with the following formula:
     $$
     \lceil \frac{size}{3} \rceil \cdot 4
     $$
     @param size the size of the decoded input
     @returns the size of the resulting encoded buffer
    */
    static std::size_t required_encode_size(std::size_t size) noexcept
    {
        return (size / 3 + (size % 3 ? 1 : 0)) * 4;
    }
 private:
    static std::uint8_t _base64_value(alphabet& alphabet, char c)
    {
        if (c >= 'A' && c <= 'Z') {
            return c - 'A';
        } else if (c >= 'a' && c <= 'z') {
            return c - 'a' + 26;
        } else if (c >= '0' && c <= '9') {
            return c - '0' + 52;
        }
        // comes down to alphabet
        if (alphabet == alphabet::standard) {
            if (c == '+') {
                return 62;
            } else if (c == '/') {
                return 63;
            }
        } else if (alphabet == alphabet::url_filename_safe) {
            if (c == '-') {
                return 62;
            } else if (c == '_') {
                return 63;
            }
        } // auto detect
        else {
            if (c == '+') {
                alphabet = alphabet::standard;
                return 62;
            } else if (c == '/') {
                alphabet = alphabet::standard;
                return 63;
            } else if (c == '-') {
                alphabet = alphabet::url_filename_safe;
                return 62;
            } else if (c == '_') {
                alphabet = alphabet::url_filename_safe;
                return 63;
            }
        }
        throw base64_error("invalid base64 character.");
    }
 };
 #endif // !PUBLIC_DOMAIN_BASE64_HPP_
--- a/common/common.cpp
+++ b/common/common.cpp
@ -90,6 +90,19 @@ void process_escapes(std::string& input) {
                case '\'': input[output_idx++] = '\''; break;
                case '\"': input[output_idx++] = '\"'; break;
                case '\\': input[output_idx++] = '\\'; break;
                case 'x':
                    // Handle \x12, etc
                    if (input_idx + 2 < input_len) {
                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
                        char *err_p = nullptr;
                        const long val = std::strtol(x, &err_p, 16);
                        if (err_p == x + 2) {
                            input_idx += 2;
                            input[output_idx++] = char(val);
                            break;
                        }
                    }
                    // fall through
                default:   input[output_idx++] = '\\';
                           input[output_idx++] = input[input_idx]; break;
            }
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@ -643,7 +643,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        return ggml_rope_custom(ctx,
            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
-            rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f
+            rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
        );
    };
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@ -1,14 +1,36 @@
-set(TARGET clip)
+add_library(llava OBJECT
-add_library(${TARGET} clip.cpp clip.h)
+            llava.cpp
-install(TARGETS ${TARGET} LIBRARY)
+            llava.h
-target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+            clip.cpp
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+            clip.h
-if (NOT MSVC)
+            )
-    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+
 target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(llava PUBLIC .)
 target_include_directories(llava PUBLIC ../..)
 target_include_directories(llava PUBLIC ../../common)
 target_compile_features(llava PRIVATE cxx_std_11)
 add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
 if (BUILD_SHARED_LIBS)
    set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD)
    add_library(llava_shared SHARED $<TARGET_OBJECTS:llava>)
    target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
    install(TARGETS llava_shared LIBRARY)
 endif()
-set(TARGET llava)
+if (NOT MSVC)
-add_executable(${TARGET} llava.cpp)
+    target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
-install(TARGETS ${TARGET} RUNTIME)
+    endif()
-target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
+if(TARGET BUILD_INFO)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+    add_dependencies(llava BUILD_INFO)
 endif()
 set(TARGET llava-cli)
 add_executable(llava-cli llava-cli.cpp)
 install(TARGETS llava-cli RUNTIME)
 target_link_libraries(llava-cli PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(llava PRIVATE cxx_std_11)
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@ -9,12 +9,12 @@ models are available.
 After API is confirmed, more models will be supported / uploaded.
 ## Usage
-Build with cmake or run `make llava` to build it.
+Build with cmake or run `make llava-cli` to build it.
-After building, run: `./llava` to see the usage. For example:
+After building, run: `./llava-cli` to see the usage. For example:
 ```sh
-./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@ -51,7 +51,6 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
 ## TODO
 - [ ] Support server mode.
 - [ ] Support non-CPU backend for the image encoding part.
 - [ ] Support different sampling methods.
 - [ ] Support more model variants.
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -680,26 +680,44 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    return new_clip;
 }
-clip_image_u8 * make_clip_image_u8() { return new clip_image_u8(); }
+clip_image_u8 * make_clip_image_u8() {
-
+    auto img = new clip_image_u8();
    return img;
 }
 clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
-bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
+void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
-    int nx, ny, nc;
+void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
    auto data = stbi_load(fname, &nx, &ny, &nc, 3);
    if (!data) {
        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname);
        return false;
    }
 static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
    img->nx = nx;
    img->ny = ny;
    img->size = nx * ny * 3;
    img->data = new uint8_t[img->size]();
    memcpy(img->data, data, img->size);
 }
 bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
    int nx, ny, nc;
    auto data = stbi_load(fname, &nx, &ny, &nc, 3);
    if (!data) {
        fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
    stbi_image_free(data);
    return true;
 }
 bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
    int nx, ny, nc;
    auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
    if (!data) {
        fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
    stbi_image_free(data);
    return true;
 }
@ -714,39 +732,40 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
-    clip_image_u8 temp; // we will keep the input image data here temporarily
+    clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
    if (pad2square && img->nx != img->ny) {
        int longer_side = std::max(img->nx, img->ny);
-        temp.nx = longer_side;
+        temp->nx = longer_side;
-        temp.ny = longer_side;
+        temp->ny = longer_side;
-        temp.size = 3 * longer_side * longer_side;
+        temp->size = 3 * longer_side * longer_side;
-        temp.data = new uint8_t[temp.size]();
+        temp->data = new uint8_t[temp->size]();
        uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
        // fill with background color
-        for (size_t i = 0; i < temp.size; i++) {
+        for (size_t i = 0; i < temp->size; i++) {
-            temp.data[i] = bc[i % 3];
+            temp->data[i] = bc[i % 3];
        }
        // copy from the input image
        for (int y = 0; y < img->ny; y++) {
            for (int x = 0; x < img->nx; x++) {
                const int i = 3 * (y * img->nx + x);
-                const int j = 3 * (y * temp.nx + x);
+                const int j = 3 * (y * temp->nx + x);
-                temp.data[j] = img->data[i];
+                temp->data[j] = img->data[i];
-                temp.data[j+1] = img->data[i+1];
+                temp->data[j+1] = img->data[i+1];
-                temp.data[j+2] = img->data[i+2];
+                temp->data[j+2] = img->data[i+2];
            }
        }
    } else {
-        temp.nx   = img->nx;
+        temp->nx   = img->nx;
-        temp.ny   = img->ny;
+        temp->ny   = img->ny;
-        temp.size = img->size;
+        temp->size = img->size;
-        temp.data = img->data;
+        temp->data = new uint8_t[temp->size]();
        *temp->data = *img->data; // copy
    }
-    const int nx = temp.nx;
+    const int nx = temp->nx;
-    const int ny = temp.ny;
+    const int ny = temp->ny;
    const int nx2 = ctx->vision_model.hparams.image_size;
    const int ny2 = ctx->vision_model.hparams.image_size;
@ -785,10 +804,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
                const int j10 = 3 * (y1 * nx + x0) + c;
                const int j11 = 3 * (y1 * nx + x1) + c;
-                const float v00 = temp.data[j00];
+                const float v00 = temp->data[j00];
-                const float v01 = temp.data[j01];
+                const float v01 = temp->data[j01];
-                const float v10 = temp.data[j10];
+                const float v10 = temp->data[j10];
-                const float v11 = temp.data[j11];
+                const float v11 = temp->data[j11];
                const float v0 = v00 * (1.0f - dx) + v01 * dx;
                const float v1 = v10 * (1.0f - dx) + v11 * dx;
@ -803,6 +822,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
            }
        }
    }
    clip_image_u8_free(temp);
    return true;
 }
@ -1049,16 +1069,16 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    return true;
 }
-int clip_n_mmproj_embd(struct clip_ctx * ctx) {
+int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
    return ctx->vision_model.mm_2_b->ne[0];
 }
-int clip_n_patches(struct clip_ctx * ctx) {
+int clip_n_patches(const struct clip_ctx * ctx) {
    auto & params = ctx->vision_model.hparams;
    return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 }
-size_t clip_embd_nbytes(struct clip_ctx * ctx) {
+size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@ -1,7 +1,22 @@
 #ifndef CLIP_H
 #define CLIP_H
-#include "ggml.h"
+#include <stddef.h>
 #include <stdint.h>
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
 #            define CLIP_API __declspec(dllexport)
 #        else
 #            define CLIP_API __declspec(dllimport)
 #        endif
 #    else
 #        define CLIP_API __attribute__ ((visibility ("default")))
 #    endif
 #else
 #    define CLIP_API
 #endif
 struct clip_ctx;
@ -20,19 +35,20 @@ struct clip_vision_hparams {
    float eps;
 };
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+/** load mmproj model */
 CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
 /** free mmproj model */
 CLIP_API void clip_free(struct clip_ctx * ctx);
-void clip_free(struct clip_ctx * ctx);
+size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-
+int clip_n_patches(const struct clip_ctx * ctx);
-size_t clip_embd_nbytes(struct clip_ctx * ctx);
+int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 int clip_n_patches(struct clip_ctx * ctx);
 int clip_n_mmproj_embd(struct clip_ctx * ctx);
 // RGB uint8 image
 struct clip_image_u8 {
    int nx;
    int ny;
-    uint8_t * data;
+    uint8_t * data = NULL;
    size_t size;
 };
@ -41,7 +57,7 @@ struct clip_image_u8 {
 struct clip_image_f32 {
    int nx;
    int ny;
-    float * data;
+    float * data = NULL;
    size_t size;
 };
@ -57,7 +73,12 @@ struct clip_image_f32_batch {
 struct clip_image_u8 * make_clip_image_u8();
 struct clip_image_f32 * make_clip_image_f32();
-bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+CLIP_API void clip_image_u8_free(clip_image_u8 * img);
 CLIP_API void clip_image_f32_free(clip_image_f32 * img);
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
 bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
 bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@ -0,0 +1,313 @@
 #include "ggml.h"
 #include "common.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
 #include "base64.hpp"
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
 static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
            fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
 }
 static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(ctx_llama, tokens, 1, n_past);
 }
 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
    std::string              str2     = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
    return true;
 }
 // TODO: use common/sampling.h
 static llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
    auto & sparams = params.sparams;
    // out of user input, sample next token
    const float   temp      = sparams.temp;
    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
    const float   top_p     = sparams.top_p;
    const float   tfs_z     = sparams.tfs_z;
    const float   typical_p = sparams.typical_p;
    // const int32_t repeat_last_n   = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
    // const float   repeat_penalty  = sparams.repeat_penalty;
    // const float   alpha_presence  = sparams.presence_penalty;
    // const float   alpha_frequency = sparams.frequency_penalty;
    const int     mirostat     = sparams.mirostat;
    const float   mirostat_tau = sparams.mirostat_tau;
    const float   mirostat_eta = sparams.mirostat_eta;
    // const bool    penalize_nl     = sparams.penalize_nl;
    llama_token id = 0;
    {
        auto logits  = llama_get_logits(ctx_llama);
        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
        // Apply params.logit_bias map
        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
            logits[it->first] += it->second;
        }
        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        if (temp <= 0) {
              // Greedy sampling
            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
        } else {
            if (mirostat == 1) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                const  int mirostat_m    = 100;
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
            } else if (mirostat == 2) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
            } else {
                  // Temperature sampling
                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token(ctx_llama, &candidates_p);
            }
        }
    }
    return id;
 }
 static const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
    int id = sample_id(ctx_llama, params);
    static std::string ret;
    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past);
    return ret.c_str();
 }
 static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
 static const char* IMG_BASE64_TAG_END = "\">";
 static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
 }
 static bool prompt_contains_image(const std::string& prompt) {
    size_t begin, end;
    find_image_tag_in_prompt(prompt, begin, end);
    return (begin != std::string::npos);
 }
 // replaces the base64 image tag in the prompt with `replacement`
 static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
    size_t img_base64_str_start, img_base64_str_end;
    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
        fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
        return NULL;
    }
    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
    auto required_bytes = base64::required_encode_size(base64_str.size());
    auto img_bytes = std::vector<unsigned char>(required_bytes);
    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
    if (!embed) {
        fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
        return NULL;
    }
    return embed;
 }
 static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
    size_t begin, end;
    find_image_tag_in_prompt(prompt, begin, end);
    if (begin == std::string::npos || end == std::string::npos) {
        return prompt;
    }
    auto pre = prompt.substr(0, begin);
    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
    return pre + replacement + post;
 }
 struct llava_context {
    struct clip_ctx * ctx_clip = NULL;
    struct llama_context * ctx_llama = NULL;
    struct llama_model * model = NULL;
 };
 static void show_additional_info(int /*argc*/, char ** argv) {
    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
    // load and preprocess the image
    llava_image_embed * embed = NULL;
    auto prompt = params->prompt;
    if (prompt_contains_image(prompt)) {
        if (!params->image.empty()) {
            printf("using base64 encoded image instead of command line image path\n");
        }
        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
        if (!embed) {
            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
            return NULL;
        }
        params->prompt = remove_image_from_prompt(prompt);
    } else {
        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
        if (!embed) {
            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
            return NULL;
        }
    }
    return embed;
 }
 static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
    int n_past = 0;
    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
    // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant.  The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, true);
    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
    eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
    // generate the response
    printf("\n");
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
        if (strcmp(tmp, "</s>") == 0) break;
        printf("%s", tmp);
        fflush(stdout);
    }
    printf("\n");
 }
 static struct llava_context * llava_init(gpt_params * params) {
    const char * clip_path = params->mmproj.c_str();
    auto prompt = params->prompt;
    if (prompt.empty()) {
        prompt = "describe the image in detail.";
    }
    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
    llama_backend_init(params->numa);
    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return NULL;
    }
    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
    if (ctx_llama == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }
    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
    ctx_llava->ctx_llama = ctx_llama;
    ctx_llava->ctx_clip = ctx_clip;
    ctx_llava->model = model;
    return ctx_llava;
 }
 static void llava_free(struct llava_context * ctx_llava) {
    if (ctx_llava->ctx_clip) {
        clip_free(ctx_llava->ctx_clip);
        ctx_llava->ctx_clip = NULL;
    }
    llama_free(ctx_llava->ctx_llama);
    llama_free_model(ctx_llava->model);
    llama_backend_free();
 }
 int main(int argc, char ** argv) {
    ggml_time_init();
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        show_additional_info(argc, argv);
        return 1;
    }
    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        gpt_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }
    auto ctx_llava = llava_init(&params);
    if (ctx_llava == NULL) {
        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
        return 1;
    }
    auto image_embed = load_image(ctx_llava, &params);
    // process the prompt
    process_prompt(ctx_llava, image_embed, &params, params.prompt);
    llama_print_timings(ctx_llava->ctx_llama);
    llava_image_embed_free(image_embed);
    llava_free(ctx_llava);
    return 0;
 }
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@ -1,147 +0,0 @@
 #pragma once
 // this one and clip lib will be eventually merged to a single lib, let's keep it this way for now
 #include "common.h"
 #include "llama.h"
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
 inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
    int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = N - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
 }
 inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
 }
 inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(ctx_llama, tokens, 1, n_past);
 }
 inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
    std::string              str2     = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
    return true;
 }
 // TODO: use common/sampling.h
 inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
    auto & sparams = params.sparams;
    // out of user input, sample next token
    const float   temp      = sparams.temp;
    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
    const float   top_p     = sparams.top_p;
    const float   tfs_z     = sparams.tfs_z;
    const float   typical_p = sparams.typical_p;
    // const int32_t repeat_last_n   = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
    // const float   repeat_penalty  = sparams.repeat_penalty;
    // const float   alpha_presence  = sparams.presence_penalty;
    // const float   alpha_frequency = sparams.frequency_penalty;
    const int     mirostat     = sparams.mirostat;
    const float   mirostat_tau = sparams.mirostat_tau;
    const float   mirostat_eta = sparams.mirostat_eta;
    // const bool    penalize_nl     = sparams.penalize_nl;
    llama_token id = 0;
    {
        auto logits  = llama_get_logits(ctx_llama);
        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
        // Apply params.logit_bias map
        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
            logits[it->first] += it->second;
        }
        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        // TODO: Apply penalties
        // float nl_logit = logits[llama_token_nl(ctx)];
        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
        // llama_sample_repetition_penalty(ctx, &candidates_p,
        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        //      last_n_repeat, repeat_penalty);
        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        // last_n_repeat, alpha_frequency, alpha_presence);
        // if (!penalize_nl) {
        //     logits[llama_token_nl(ctx)] = nl_logit;
        // }
        if (temp <= 0) {
              // Greedy sampling
            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
        } else {
            if (mirostat == 1) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                const  int mirostat_m    = 100;
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
            } else if (mirostat == 2) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
            } else {
                  // Temperature sampling
                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token(ctx_llama, &candidates_p);
            }
        }
    }
    return id;
 }
 inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
    int id = sample_id(ctx_llama, params);
    static std::string ret;
    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past);
    return ret.c_str();
 }
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@ -1,164 +1,156 @@
 #include "clip.h"
 #include "llava-utils.h"
 #include "common.h"
 #include "llama.h"
 #include "llava.h"
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
-static void show_additional_info(int /*argc*/, char ** argv) {
+#include "base64.hpp"
    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
-int main(int argc, char ** argv) {
+static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    ggml_time_init();
+    clip_image_f32 * img_res = make_clip_image_f32();
-
+    if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
-    gpt_params params;
+        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
-
+        clip_image_f32_free(img_res);
-    if (!gpt_params_parse(argc, argv, params)) {
+        return false;
        show_additional_info(argc, argv);
        return 1;
    }
-    if (params.mmproj.empty() || params.image.empty()) {
+    *n_img_pos = clip_n_patches(ctx_clip);
        gpt_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }
    const char * clip_path = params.mmproj.c_str();
    const char * img_path = params.image.c_str();
    if (params.prompt.empty()) {
        params.prompt = "describe the image in detail.";
    }
    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
    // load and preprocess the image
    clip_image_u8 img;
    clip_image_f32 img_res;
    if (!clip_image_load_from_file(img_path, &img)) {
        fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
        clip_free(ctx_clip);
        return 1;
    }
    if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) {
        fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path);
        clip_free(ctx_clip);
        return 1;
    }
    int n_img_pos  = clip_n_patches(ctx_clip);
    int n_img_embd = clip_n_mmproj_embd(ctx_clip);
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
    if (!image_embd) {
        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
        return 1;
    }
    const int64_t t_img_enc_start_us = ggml_time_us();
-    if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) {
+    bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
    clip_image_f32_free(img_res);
    if (!encoded) {
        fprintf(stderr, "Unable to encode image\n");
-        return 1;
+        return false;
    }
    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
-    // we get the embeddings, free up the memory required for CLIP
+    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
    clip_free(ctx_clip);
-    llama_backend_init(params.numa);
+    return true;
-
+}
-    llama_model_params model_params              = llama_model_default_params();
+
-                       model_params.n_gpu_layers = params.n_gpu_layers;
+bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
-                       model_params.main_gpu     = params.main_gpu;
+        // make sure that the correct mmproj was used, i.e., compare apples to apples
-                       model_params.tensor_split = params.tensor_split;
+    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
-                       model_params.use_mmap     = params.use_mmap;
+    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
-                       model_params.use_mlock    = params.use_mlock;
+    if (n_image_embd != n_llama_embd) {
-
+        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+        return false;
-    if (model == NULL) {
+    }
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+    return true;
-        return 1;
+}
-    }
+
-
+static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    llama_context_params ctx_params = llama_context_default_params();
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
-
+    if (!image_embd) {
-    ctx_params.n_ctx           = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
+        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
-    ctx_params.n_threads       = params.n_threads;
+        free(image_embd);
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+        return false;
-    ctx_params.seed            = params.seed;
+    }
-
+
-    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+    int n_img_pos;
-
+    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-    if (ctx_llama == NULL) {
+        fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        free(image_embd);
-        return 1;
+        return false;
-    }
+    }
-
+    *image_embd_out = image_embd;
-    // make sure that the correct mmproj was used, i.e., compare apples to apples
+    *n_img_pos_out = n_img_pos;
-    const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+
-
+    return true;
-    if (n_img_embd != n_llama_embd) {
+}
-        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
+
-
+bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
-        llama_free(ctx_llama);
+    int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
-        llama_free_model(model);
+
-        llama_backend_free();
+    for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
-        free(image_embd);
+        int n_eval = image_embed->n_image_pos - i;
-
+        if (n_eval > n_batch) {
-        return 1;
+            n_eval = n_batch;
-    }
+        }
-
+        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-    // process the prompt
+        if (llama_decode(ctx_llama, batch)) {
-    // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
+            fprintf(stderr, "%s : failed to eval\n", __func__);
-
+            return false;
-    int n_past = 0;
+        }
-
+        *n_past += n_eval;
-    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+    }
-
+    return true;
-    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
+}
-    eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
+
-    eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);
+LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
-
+    clip_image_u8 * img = make_clip_image_u8();
-    // generate the response
+    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
-
+        clip_image_u8_free(img);
-    printf("\n");
+        fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
-    printf("prompt: '%s'\n", params.prompt.c_str());
+        return NULL;
-    printf("\n");
+    }
-
+
-    for (int i = 0; i < max_tgt_len; i++) {
+    float* image_embed = NULL;
-        const char * tmp = sample(ctx_llama, params, &n_past);
+    int n_image_pos = 0;
-        if (strcmp(tmp, "</s>") == 0) break;
+    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
-
+    if (!image_embed_result) {
-        printf("%s", tmp);
+        clip_image_u8_free(img);
-        fflush(stdout);
+        fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
-    }
+        return NULL;
-
+    }
-    printf("\n");
+
-
+    clip_image_u8_free(img);
-    {
+    auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
-        const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
+    result->embed = image_embed;
-
+    result->n_image_pos = n_image_pos;
-        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
+    return result;
-    }
+}
-
+
-    llama_print_timings(ctx_llama);
+static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
-
+    auto file = fopen(path, "rb");
-    llama_free(ctx_llama);
+    if (file == NULL) {
-    llama_free_model(model);
+        fprintf(stderr, "%s: can't read file %s\n", __func__, path);
-    llama_backend_free();
+        return false;
-    free(image_embd);
+    }
-
+
-    return 0;
+    fseek(file, 0, SEEK_END);
    auto fileSize = ftell(file);
    fseek(file, 0, SEEK_SET);
    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
    if (buffer == NULL) {
        fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
    }
    fread(buffer, 1, fileSize, file); // Read the file into the buffer
    fclose(file); // Close the file
    *bytesOut = buffer;
    *sizeOut = fileSize;
    return true;
 }
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
    unsigned char* image_bytes;
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded) {
        fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
        return NULL;
    }
    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
    free(image_bytes);
    return embed;
 }
 LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) {
    free(embed->embed);
    free(embed);
 }
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@ -0,0 +1,50 @@
 #ifndef LLAVA_H
 #define LLAVA_H
 #include "ggml.h"
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
 #            define LLAVA_API __declspec(dllexport)
 #        else
 #            define LLAVA_API __declspec(dllimport)
 #        endif
 #    else
 #        define LLAVA_API __attribute__ ((visibility ("default")))
 #    endif
 #else
 #    define LLAVA_API
 #endif
 struct clip_ctx;
 #ifdef __cplusplus
 extern "C" {
 #endif
 struct llava_image_embed {
    float * embed;
    int n_image_pos;
 };
 /** sanity check for clip <-> llava embed size match */
 LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
 /** build an image embed from image file bytes */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 /** build an image embed from a path to an image filename */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
 LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** free an embedding made with llava_image_embed_make_* */
 /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
 LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -7,7 +7,7 @@ Command line options:
 -   `--threads N`, `-t N`: Set the number of threads to use during generation.
 -   `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
 -   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
-   `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
+-   `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 -   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 -   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -39,10 +39,6 @@
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 #define cudaDeviceGetMemPool hipDeviceGetMemPool
 #define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold
 #define cudaMemPoolSetAttribute hipMemPoolSetAttribute
 #define cudaMemPool_t hipMemPool_t
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
@ -52,7 +48,6 @@
 #define cudaEvent_t hipEvent_t
 #define cudaEventDestroy hipEventDestroy
 #define cudaFree hipFree
 #define cudaFreeAsync hipFreeAsync
 #define cudaFreeHost hipHostFree
 #define cudaGetDevice hipGetDevice
 #define cudaGetDeviceCount hipGetDeviceCount
@ -60,7 +55,6 @@
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
 #define cudaMalloc hipMalloc
 #define cudaMallocFromPoolAsync hipMallocFromPoolAsync
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpy2DAsync hipMemcpy2DAsync
@ -187,11 +181,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
    do {                                                                                \
        cudaError_t err_ = (err);                                                       \
        if (err_ != cudaSuccess) {                                                      \
-            int dev_id;                                                                     \
+            int id;                                                                     \
-            cudaGetDevice(&dev_id);                                                         \
+            cudaGetDevice(&id);                                                         \
            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                cudaGetErrorString(err_));                                              \
-            fprintf(stderr, "current device: %d\n", dev_id);                                \
+            fprintf(stderr, "current device: %d\n", id);                                \
            exit(1);                                                                    \
        }                                                                               \
    } while (0)
@ -201,11 +195,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
    do {                                                                                \
        cublasStatus_t err_ = (err);                                                    \
        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
-            int dev_id;                                                                     \
+            int id;                                                                     \
-            cudaGetDevice(&dev_id);                                                         \
+            cudaGetDevice(&id);                                                         \
            fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n",                         \
                    err_, __FILE__, __LINE__, cublasGetStatusString(err_));             \
-            fprintf(stderr, "current device: %d\n", dev_id);                                \
+            fprintf(stderr, "current device: %d\n", id);                                \
            exit(1);                                                                    \
        }                                                                               \
    } while (0)
@ -471,7 +465,6 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define MAX_STREAMS 8
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
 static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr };
 struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@ -5780,16 +5773,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
    return ptr;
 }
 static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int id, cudaStream_t stream) {
    if (g_cudaMemPools[id] == nullptr) {
        return ggml_cuda_pool_malloc(size, actual_size);
    }
    void *ptr;
    CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[id], stream));
    *actual_size = size;
    return ptr;
 }
 static void ggml_cuda_pool_free(void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
@ -5807,12 +5790,10 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
    CUDA_CHECK(cudaFree(ptr));
 }
 static bool g_cublas_loaded = false;
-static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
+bool ggml_cublas_loaded(void) {
-    if (g_cudaMemPools[id] == nullptr) {
+    return g_cublas_loaded;
        return ggml_cuda_pool_free(ptr, actual_size);
    }
    CUDA_CHECK(cudaFreeAsync(ptr, stream));
 }
 void ggml_init_cublas() {
@ -5827,7 +5808,12 @@ void ggml_init_cublas() {
        CUDA_CHECK(cudaDeviceSynchronize());
 #endif
-        CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
            initialized = true;
            g_cublas_loaded = false;
            return;
        }
        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
        int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
@ -5869,19 +5855,13 @@ void ggml_init_cublas() {
            // create cublas handle
            CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
            CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
            // configure memory pool
            cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
            if (err == cudaSuccess) {
                size_t treshold = UINT64_MAX;
                CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
            }
        }
        // configure logging to stdout
        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
        initialized = true;
        g_cublas_loaded = true;
    }
 }
@ -6469,7 +6449,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = row_diff*ne00;
-            src0_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src0_as, id, stream);
+            src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
            to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
        }
        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
@ -6480,12 +6460,13 @@ inline void ggml_cuda_op_mul_mat_cublas(
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = src1_ncols*ne10;
-            src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream);
+            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
        }
        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-        size_t dst_f16_as = 0;
+
-        half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream);
+        size_t dst_as = 0;
        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
        const half alpha_f16 = 1.0f;
        const half beta_f16 = 0.0f;
@ -6503,15 +6484,14 @@ inline void ggml_cuda_op_mul_mat_cublas(
        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
-        if (dst_f16_as != 0) {
+        ggml_cuda_pool_free(dst_f16, dst_as);
            ggml_cuda_pool_free_async(dst_f16, dst_f16_as, id, stream);
        }
        if (src0_as != 0) {
-            ggml_cuda_pool_free_async(src0_as_f16, src0_as, id, stream);
+            ggml_cuda_pool_free(src0_as_f16, src0_as);
        }
        if (src1_as != 0) {
-            ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, stream);
+            ggml_cuda_pool_free(src1_as_f16, src1_as);
        }
    }
    else {
@ -6521,7 +6501,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
        if (src0->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
            GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc_async(row_diff*ne00 * sizeof(float), &src0_as, id, stream); // NOLINT
+            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
        }
        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
@ -6538,7 +6518,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
                    &beta,  dst_dd_i,   ldc));
        if (src0_as != 0) {
-            ggml_cuda_pool_free_async(src0_ddq_as_f32, src0_as, id, stream);
+            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
        }
    }
@ -6924,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
    int64_t  row_low[GGML_CUDA_MAX_DEVICES];
    int64_t row_high[GGML_CUDA_MAX_DEVICES];
    int used_devices = 0;
    for (int64_t id = 0; id < g_device_count; ++id) {
        // by default, use all rows
        row_low[id]  = 0;
@ -6951,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
            continue;
        }
        used_devices++;
        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
@ -6961,22 +6945,21 @@ static void ggml_cuda_op_mul_mat(
            src0_dd[id] = (char *) src0_extra->data_device[id];
        } else {
            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
-            src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream);
+            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
        }
        if (src1_on_device && src1_is_contiguous) {
            src1_ddf[id] = (float *) src1_extra->data_device[id];
        } else {
-            src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream);
+            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
        }
        if (convert_src1_to_q8_1) {
-            const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
+            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
            src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream);
            if (src1_on_device && src1_is_contiguous) {
                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
-                // CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaGetLastError());
            }
        }
@ -6984,18 +6967,18 @@ static void ggml_cuda_op_mul_mat(
            dst_dd[id] = (float *) dst_extra->data_device[id];
        } else {
            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
-            dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id,  stream);
+            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
        }
    }
    // if multiple devices are used they need to wait for the main device
    // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && g_device_count > 1) {
+    if (split && used_devices > 1) {
        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
    }
-    const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@ -7110,6 +7093,27 @@ static void ggml_cuda_op_mul_mat(
        }
    }
    for (int64_t id = 0; id < g_device_count; ++id) {
        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
            continue;
        }
        CUDA_CHECK(ggml_cuda_set_device(id));
        // free buffers again when done
        if (src0_as[id] > 0) {
            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
        }
        if (src1_asf[id] > 0) {
            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
        }
        if (src1_asq[id] > 0) {
            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
        }
        if (dst_as[id] > 0) {
            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
        }
    }
    // main device waits for all other devices to be finished
    if (split && g_device_count > 1) {
        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
@ -7117,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(
        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
        for (int64_t id = 0; id < g_device_count; ++id) {
            if (row_low[id] == row_high[id]) {
                continue;
            }
            for (int64_t is = 0; is < is_max; ++is) {
                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
            }
@ -7127,21 +7134,6 @@ static void ggml_cuda_op_mul_mat(
        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
        CUDA_CHECK(cudaDeviceSynchronize());
    }
    for (int64_t id = 0; id < g_device_count; ++id) {
        if (src0_as[id] > 0) {
            ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]);
        }
        if (src1_asf[id] > 0) {
            ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]);
        }
        if (src1_asq[id] > 0) {
            ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]);
        }
        if (dst_as[id] > 0) {
            ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]);
        }
    }
 }
 static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -7177,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    if (!g_cublas_loaded) return false;
    const int64_t ne10 = src1->ne[0];
    const int64_t ne0 = dst->ne[0];
@ -7328,11 +7322,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
    GGML_ASSERT(to_fp16_cuda != nullptr);
    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream);
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream);
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);
@ -7386,8 +7380,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
        size_t ptrs_src_s = 0;
        size_t ptrs_dst_s = 0;
-        ptrs_src = (const void **) ggml_cuda_pool_malloc_async(2*ne23*sizeof(void *), &ptrs_src_s, id, main_stream);
+        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-        ptrs_dst = (      void **) ggml_cuda_pool_malloc_async(1*ne23*sizeof(void *), &ptrs_dst_s, id, main_stream);
+        ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
        dim3 block_dims(ne13, ne12);
        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
@ -7400,6 +7394,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                dst->nb[2], dst->nb[3],
                r2, r3);
        CUDA_CHECK(cudaGetLastError());
        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
@ -7411,30 +7406,29 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
        if (ptrs_src_s != 0) {
-            ggml_cuda_pool_free_async(ptrs_src, ptrs_src_s, id, main_stream);
+            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
        }
        if (ptrs_dst_s != 0) {
-            ggml_cuda_pool_free_async(ptrs_dst, ptrs_dst_s, id, main_stream);
+            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
        }
    }
 #endif
    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-    if (src1_as != 0) {
+
-        ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream);
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
-    }
+    ggml_cuda_pool_free(dst_f16, dst_as);
    if (dst_as != 0) {
        ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream);
    }
 }
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
        (src1->backend == GGML_BACKEND_GPU) &&
        ( dst->backend == GGML_BACKEND_GPU);
    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
    int64_t min_compute_capability = INT_MAX;
    for (int64_t id = 0; id < g_device_count; ++id) {
        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@ -7456,13 +7450,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        // KQ single-batch
        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
        // KQ + KQV multi-batch
        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
    } else if (src0->type == GGML_TYPE_F32) {
@ -7862,6 +7856,8 @@ void ggml_cuda_free_scratch() {
 }
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    if (!g_cublas_loaded) return false;
    ggml_cuda_func_t func;
    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -17,7 +17,12 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES       16
 // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
 GGML_API void   ggml_init_cublas(void);
 // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
 GGML_API bool   ggml_cublas_loaded(void);
 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void   ggml_cuda_host_free(void * ptr);
--- a/ggml.c
+++ b/ggml.c
@ -4970,8 +4970,13 @@ struct ggml_tensor * ggml_rope_back(
        int                   n_dims,
        int                   mode,
        int                   n_ctx,
        int                   n_orig_ctx,
        float                 freq_base,
        float                 freq_scale,
        float                 ext_factor,
        float                 attn_factor,
        float                 beta_fast,
        float                 beta_slow,
        float                 xpos_base,
        bool                  xpos_down) {
    GGML_ASSERT(ggml_is_vector(b));
@ -4988,11 +4993,15 @@ struct ggml_tensor * ggml_rope_back(
    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
+    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
-    memcpy(params + 4, &freq_base,  sizeof(float));
+    memcpy(params +  5, &freq_base,    sizeof(float));
-    memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
-    memcpy(params + 6, &xpos_base,  sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
-    memcpy(params + 7, &xpos_down,  sizeof(bool));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
    memcpy(params +  9, &beta_fast,    sizeof(float));
    memcpy(params + 10, &beta_slow,    sizeof(float));
    memcpy(params + 11, &xpos_base,    sizeof(float));
    memcpy(params + 12, &xpos_down,    sizeof(bool));
    ggml_set_op_params(result, params, sizeof(params));
    result->op   = GGML_OP_ROPE_BACK;
@ -10974,7 +10983,8 @@ static void ggml_compute_forward_rope_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
        const bool forward) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
@ -11033,6 +11043,11 @@ static void ggml_compute_forward_rope_f32(
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
    // backward process uses inverse rotation by cos and sin.
    // cos and sin build a rotation matrix, where the inverse is the transpose.
    // this essentially just switches the sign of sin.
    const float sin_sign = forward ? 1.0f : -1.0f;
    const int32_t * pos = (const int32_t *) src1->data;
    for (int64_t i3 = 0; i3 < ne3; i3++) {
@ -11049,9 +11064,9 @@ static void ggml_compute_forward_rope_f32(
                    float block_theta = MAX(p - (n_ctx - 2), 0);
                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float sin_theta = sinf(theta_base) * sin_sign;
                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta);
+                        const float sin_block_theta = sinf(block_theta) * sin_sign;
                        theta_base *= theta_scale;
                        block_theta *= theta_scale;
@ -11075,6 +11090,7 @@ static void ggml_compute_forward_rope_f32(
                        rope_yarn(
                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                        );
                        sin_theta *= sin_sign;
                        // zeta scaling for xPos only:
                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@ -11105,6 +11121,7 @@ static void ggml_compute_forward_rope_f32(
                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                &cos_theta, &sin_theta
                            );
                            sin_theta *= sin_sign;
                            theta_base *= theta_scale;
@ -11130,7 +11147,8 @@ static void ggml_compute_forward_rope_f16(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
        const bool forward) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
@ -11182,6 +11200,11 @@ static void ggml_compute_forward_rope_f16(
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
    // backward process uses inverse rotation by cos and sin.
    // cos and sin build a rotation matrix, where the inverse is the transpose.
    // this essentially just switches the sign of sin.
    const float sin_sign = forward ? 1.0f : -1.0f;
    const int32_t * pos = (const int32_t *) src1->data;
    for (int64_t i3 = 0; i3 < ne3; i3++) {
@ -11198,9 +11221,9 @@ static void ggml_compute_forward_rope_f16(
                    float block_theta = MAX(p - (n_ctx - 2), 0);
                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float sin_theta = sinf(theta_base) * sin_sign;
                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta);
+                        const float sin_block_theta = sinf(block_theta) * sin_sign;
                        theta_base *= theta_scale;
                        block_theta *= theta_scale;
@ -11224,6 +11247,7 @@ static void ggml_compute_forward_rope_f16(
                        rope_yarn(
                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                        );
                        sin_theta *= sin_sign;
                        theta_base *= theta_scale;
@ -11250,6 +11274,7 @@ static void ggml_compute_forward_rope_f16(
                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                &cos_theta, &sin_theta
                            );
                            sin_theta *= sin_sign;
                            theta_base *= theta_scale;
@ -11279,11 +11304,11 @@ static void ggml_compute_forward_rope(
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
            } break;
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
            } break;
        default:
            {
@ -11294,216 +11319,6 @@ static void ggml_compute_forward_rope(
 // ggml_compute_forward_rope_back
 static void ggml_compute_forward_rope_back_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
    // y = rope(x, src1)
    // dx = rope_back(dy, src1)
    // src0 is dy, src1 contains options
    float freq_base;
    float freq_scale;
    // these two only relevant for xPos RoPE:
    float xpos_base;
    bool xpos_down;
    //const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_dims = ((int32_t *) dst->op_params)[1];
    const int mode   = ((int32_t *) dst->op_params)[2];
    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
    GGML_TENSOR_UNARY_OP_LOCALS
    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
    assert(nb0 == sizeof(float));
    const int ith = params->ith;
    const int nth = params->nth;
    const int nr = ggml_nrows(dst);
    // rows per thread
    const int dr = (nr + nth - 1)/nth;
    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);
    // row index used to determine which thread to use
    int ir = 0;
    const float theta_scale = powf(freq_base, -2.0f/n_dims);
    const bool is_neox = mode & 2;
    const int32_t * pos = (const int32_t *) src1->data;
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            const int64_t p = pos[i2];
            for (int64_t i1 = 0; i1 < ne1; i1++) {
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;
                float theta_base = freq_scale * (float)p;
                if (!is_neox) {
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                        const float cos_theta = cosf(theta_base);
                        const float sin_theta = sinf(theta_base);
                        // zeta scaling for xPos only:
                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
                        if (xpos_down) zeta = 1.0f / zeta;
                        theta_base *= theta_scale;
                        const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                              float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
                        const float dy0 = dy[0];
                        const float dy1 = dy[1];
                        dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
                        dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
                    }
                } else {
                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
                            const float cos_theta = cosf(theta_base);
                            const float sin_theta = sinf(theta_base);
                            theta_base *= theta_scale;
                            const int64_t i0 = ib*n_dims + ic/2;
                            const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                                  float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
                            const float dy0 = dy[0];
                            const float dy1 = dy[n_dims/2];
                            dx[0]        =   dy0*cos_theta + dy1*sin_theta;
                            dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
                        }
                    }
                }
            }
        }
    }
 }
 static void ggml_compute_forward_rope_back_f16(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
    // y = rope(x, src1)
    // dx = rope_back(dy, src1)
    // src0 is dy, src1 contains options
    //const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_dims = ((int32_t *) dst->op_params)[1];
    const int mode   = ((int32_t *) dst->op_params)[2];
    GGML_TENSOR_UNARY_OP_LOCALS
    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
    assert(nb0 == sizeof(ggml_fp16_t));
    const int ith = params->ith;
    const int nth = params->nth;
    const int nr = ggml_nrows(dst);
    // rows per thread
    const int dr = (nr + nth - 1)/nth;
    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);
    // row index used to determine which thread to use
    int ir = 0;
    const float theta_scale = powf(10000.0, -2.0f/n_dims);
    const bool is_neox = mode & 2;
    const int32_t * pos = (const int32_t *) src1->data;
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            const int64_t p = pos[i2];
            for (int64_t i1 = 0; i1 < ne1; i1++) {
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;
                float theta_base = (float)p;
                if (!is_neox) {
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                        const float cos_theta = cosf(theta_base);
                        const float sin_theta = sinf(theta_base);
                        theta_base *= theta_scale;
                        const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                              ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
                        const float dy0 = GGML_FP16_TO_FP32(dy[0]);
                        const float dy1 = GGML_FP16_TO_FP32(dy[1]);
                        dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
                        dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
                    }
                } else {
                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
                            const float cos_theta = cosf(theta_base);
                            const float sin_theta = sinf(theta_base);
                            theta_base *= theta_scale;
                            const int64_t i0 = ib*n_dims + ic/2;
                            const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                                  ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
                            const float dy0 = GGML_FP16_TO_FP32(dy[0]);
                            const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
                            dx[0]        = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
                            dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
                        }
                    }
                }
            }
        }
    }
 }
 static void ggml_compute_forward_rope_back(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@ -11512,11 +11327,11 @@ static void ggml_compute_forward_rope_back(
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-                ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
            } break;
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
            } break;
        default:
            {
@ -15559,17 +15374,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                // necessary for llama
                if (src0->grad) {
                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
-                    float freq_scale;
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
-                    float xpos_base;
+
-                    bool  xpos_down;
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
                    src0->grad = ggml_add_or_set(ctx,
                            src0->grad,
@ -15579,8 +15397,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                n_dims,
                                mode,
                                n_ctx,
                                n_orig_ctx,
                                freq_base,
                                freq_scale,
                                ext_factor,
                                attn_factor,
                                beta_fast,
                                beta_slow,
                                xpos_base,
                                xpos_down),
                            zero_table);
@ -15590,17 +15413,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                if (src0->grad) {
                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
-                    float freq_scale;
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
-                    float xpos_base;
+
-                    bool  xpos_down;
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
                    src0->grad = ggml_add_or_set(ctx,
                            src0->grad,
@ -15609,14 +15435,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                src1,
                                n_dims,
                                mode,
                                0,
                                n_ctx,
                                n_orig_ctx,
                                freq_base,
                                freq_scale,
-                                0.0f,
+                                ext_factor,
-                                1.0f,
+                                attn_factor,
-                                0.0f,
+                                beta_fast,
-                                0.0f,
+                                beta_slow,
                                xpos_base,
                                xpos_down,
                                false),
--- a/ggml.h
+++ b/ggml.h
@ -1372,8 +1372,13 @@ extern "C" {
            int                   n_dims,
            int                   mode,
            int                   n_ctx,
            int                   n_orig_ctx,
            float                 freq_base,
            float                 freq_scale,
            float                 ext_factor,
            float                 attn_factor,
            float                 beta_fast,
            float                 beta_slow,
            float                 xpos_base,
            bool                  xpos_down);
--- a/llama.cpp
+++ b/llama.cpp
@ -617,19 +617,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //
 inline void * llama_host_malloc(size_t n) {
 #ifdef GGML_USE_CUBLAS
-#   define llama_host_malloc(n)  ggml_cuda_host_malloc(n)
+    if (ggml_cublas_loaded()) {
-#   define llama_host_free(data) ggml_cuda_host_free(data)
+        return ggml_cuda_host_malloc(n);
    } else {
        return malloc(n);
    }
 #elif GGML_USE_METAL
-#   define llama_host_malloc(n)  ggml_metal_host_malloc(n)
+    return ggml_metal_host_malloc(n);
 #   define llama_host_free(data) ggml_metal_host_free(data)
 #elif GGML_USE_CPU_HBM
-#   define llama_host_malloc(n)  hbw_malloc(n)
+    return hbw_malloc(n);
 #   define llama_host_free(data) if (data != NULL) hbw_free(data)
 #else
-#   define llama_host_malloc(n)  malloc(n)
+    return malloc(n);
 #   define llama_host_free(data) free(data)
 #endif
 }
 inline void llama_host_free(void * ptr) {
 #ifdef GGML_USE_CUBLAS
    if (ggml_cublas_loaded()) {
        return ggml_cuda_host_free(ptr);
    } else {
        return free(ptr);
    }
 #elif GGML_USE_METAL
    return ggml_metal_host_free(ptr);
 #elif GGML_USE_CPU_HBM
    return hbw_free(ptr);
 #else
    return free(ptr);
 #endif
 }
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
@ -1221,9 +1239,11 @@ struct llama_kv_cache {
        }
 #ifdef GGML_USE_CUBLAS
-        ggml_cuda_free_data(k);
+        if (ggml_cublas_loaded()) {
-        ggml_cuda_free_data(v);
+            ggml_cuda_free_data(k);
-#endif // GGML_USE_CUBLAS
+            ggml_cuda_free_data(v);
        }
 #endif
    }
 };
@ -1323,11 +1343,15 @@ struct llama_model {
        }
 #ifdef GGML_USE_CUBLAS
-        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+        if (ggml_cublas_loaded()) {
-            ggml_cuda_free_data(tensors_by_name[i].second);
+            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
                ggml_cuda_free_data(tensors_by_name[i].second);
            }
            ggml_cuda_free_scratch();
        }
-        ggml_cuda_free_scratch();
+#endif
-#elif defined(GGML_USE_CLBLAST)
+
 #if defined(GGML_USE_CLBLAST)
        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
            ggml_cl_free_data(tensors_by_name[i].second);
        }
@ -1439,23 +1463,26 @@ static bool llama_kv_cache_init(
    ggml_set_name(cache.v, "cache_v");
    (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
    size_t vram_kv_cache = 0;
-    if (n_gpu_layers > (int)n_layer + 1) {
+#ifdef GGML_USE_CUBLAS
-        ggml_cuda_assign_buffers_no_scratch(cache.v);
+    if (ggml_cublas_loaded()) {
-        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+        size_t vram_kv_cache = 0;
-        vram_kv_cache += ggml_nbytes(cache.v);
+
        if (n_gpu_layers > (int)n_layer + 1) {
            ggml_cuda_assign_buffers_no_scratch(cache.v);
            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
            vram_kv_cache += ggml_nbytes(cache.v);
        }
        if (n_gpu_layers > (int)n_layer + 2) {
            ggml_cuda_assign_buffers_no_scratch(cache.k);
            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
            vram_kv_cache += ggml_nbytes(cache.k);
        }
        if (vram_kv_cache > 0) {
            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
        }
    }
-    if (n_gpu_layers > (int)n_layer + 2) {
+#endif
        ggml_cuda_assign_buffers_no_scratch(cache.k);
        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
        vram_kv_cache += ggml_nbytes(cache.k);
    }
    if (vram_kv_cache > 0) {
        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
    }
 #endif // GGML_USE_CUBLAS
    return true;
 }
@ -2552,18 +2579,22 @@ static void llm_load_tensors(
    }
    (void) main_gpu;
    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
 #ifdef GGML_USE_CUBLAS
-    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+    if (ggml_cublas_loaded()) {
-    ggml_cuda_set_main_device(main_gpu);
+        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
+        ggml_cuda_set_main_device(main_gpu);
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+
        llama_backend_offload = GGML_BACKEND_GPU;
        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
    }
 #elif defined(GGML_USE_CLBLAST)
-    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+        LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
+        llama_backend_offload = GGML_BACKEND_GPU;
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
+        llama_backend_offload_split = GGML_BACKEND_GPU;
 #else
 #define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_CPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif
    // prepare memory for the weights
@ -2590,12 +2621,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@ -2619,8 +2650,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);
                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
                        auto & layer = model.layers[i];
@ -2656,12 +2687,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@ -2685,8 +2716,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);
                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
                        auto & layer = model.layers[i];
@ -2726,12 +2757,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@ -2757,8 +2788,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);
                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
                        auto & layer = model.layers[i];
@ -2803,12 +2834,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@ -2834,8 +2865,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);
                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
                        auto & layer = model.layers[i];
@ -2880,12 +2911,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@ -2908,8 +2939,8 @@ static void llm_load_tensors(
                    const int i_gpu_start = n_layer - n_gpu_layers;
                    model.layers.resize(n_layer);
                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
                        auto & layer = model.layers[i];
                        layer.attn_norm     = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, backend);
                        layer.attn_norm_b   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "bias",   i), {n_embd}, backend);
@ -2946,12 +2977,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@ -2977,8 +3008,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);
                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
                        auto & layer = model.layers[i];
@ -3024,12 +3055,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@ -3053,8 +3084,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);
                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
                        auto & layer = model.layers[i];
@ -5397,11 +5428,12 @@ static int llama_decode_internal(
    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA    ||
+        model.arch == LLM_ARCH_LLAMA      ||
-        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_BAICHUAN   ||
-        model.arch == LLM_ARCH_FALCON   ||
+        model.arch == LLM_ARCH_FALCON     ||
-        model.arch == LLM_ARCH_REFACT   ||
+        model.arch == LLM_ARCH_REFACT     ||
-        model.arch == LLM_ARCH_MPT      ||
+        model.arch == LLM_ARCH_MPT        ||
        model.arch == LLM_ARCH_STARCODER  ||
        model.arch == LLM_ARCH_STABLELM;
    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;