Merge branch 'ggerganov:master' into server-chat-templates
commit 04a071108e
10 changed files with 166 additions and 79 deletions
@@ -24,11 +24,12 @@
        "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
      }
    },
    { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
    { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
+   { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },

    {
        "name": "arm64-windows-msvc", "hidden": true,

@@ -57,25 +58,28 @@
      }
    },

-   { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
+   { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },

-   { "name": "arm64-apple-clang-debug" , "inherits": [ "base", "arm64-apple-clang", "debug" ] },
-   { "name": "arm64-apple-clang-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
-   { "name": "arm64-apple-clang+static-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
+   { "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
+   { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
+   { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },

-   { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
+   { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

-   { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
+   { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

-   { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
+   { "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
    { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
-   { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
+   { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
+
+   { "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
+   { "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
  ]
}
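For reference, the new Vulkan presets added above can be selected like any other configure preset, e.g. `cmake --preset x64-windows-vulkan-release`; the exact invocation is an illustration of the standard CMake preset workflow, not part of this change.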
Makefile (8 changed lines)
@@ -359,6 +359,10 @@ ifdef LLAMA_SERVER_SSL
    MK_LDFLAGS += -lssl -lcrypto
endif

+ifndef GGML_NO_CPU_AARCH64
+   MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+endif
+
# warnings
WARN_FLAGS = \
    -Wall \

@@ -940,10 +944,6 @@ ggml/src/ggml-cuda/%.o: \
    $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
endif # GGML_MUSA

-ifndef GGML_NO_CPU_AARCH64
-   MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
-endif
-
ifdef GGML_METAL
    MK_CPPFLAGS += -DGGML_USE_METAL
    MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
@@ -8,19 +8,42 @@

#define UNUSED GGML_UNUSED

-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x4 out;

    for (int i = 0; i < 4; i++) {
        out.d[i] = in[i].d;
    }

-   for (int i = 0; i < QK4_0 * 2; i++) {
-       int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
-       int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
-       src_offset += (i % blck_size_interleave);
+   const int end = QK4_0 * 2 / blck_size_interleave;

-       out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+   if (blck_size_interleave == 8) {
+       const uint64_t xor_mask = 0x8888888888888888ULL;
+       for (int i = 0; i < end; ++i) {
+           int src_id = i % 4;
+           int src_offset = (i / 4) * blck_size_interleave;
+           int dst_offset = i * blck_size_interleave;
+
+           uint64_t elems;
+           // Using memcpy to avoid unaligned memory accesses
+           memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+           elems ^= xor_mask;
+           memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+       }
+   } else if (blck_size_interleave == 4) {
+       const uint32_t xor_mask = 0x88888888;
+       for (int i = 0; i < end; ++i) {
+           int src_id = i % 4;
+           int src_offset = (i / 4) * blck_size_interleave;
+           int dst_offset = i * blck_size_interleave;
+
+           uint32_t elems;
+           memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+           elems ^= xor_mask;
+           memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+       }
+   } else {
+       GGML_ASSERT(false);
    }

    return out;

@@ -30,19 +53,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x8 out;

    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].d;
    }

-   for (int i = 0; i < QK4_0 * 4; i++) {
-       int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
-       int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
-       src_offset += (i % blck_size_interleave);
+   const int end = QK4_0 * 4 / blck_size_interleave;
+   const uint64_t xor_mask = 0x8888888888888888ULL;

-       out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+   for (int i = 0; i < end; ++i) {
+       int src_id = i % 8;
+       int src_offset = (i / 8) * blck_size_interleave;
+       int dst_offset = i * blck_size_interleave;
+
+       uint64_t elems;
+       memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+       elems ^= xor_mask;
+       memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    return out;

@@ -71,11 +100,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
        }

        if (nrows_interleaved == 8) {
-           *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave, 0x88);
+           *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
            out_ptr = (block_q4_0x8 *) out_ptr + 1;
        }
        else if (nrows_interleaved == 4) {
-           *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x88);
+           *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
            out_ptr = (block_q4_0x4 *) out_ptr + 1;
        }
    }
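The rewritten loop above copies `blck_size_interleave` bytes at a time with `memcpy` and applies the XOR mask to a whole 64-bit (or 32-bit) chunk instead of byte by byte. The following is a minimal, self-contained sketch of the `blck_size_interleave == 8` path; it is not taken from ggml — the block layout is reduced to plain byte arrays and names such as `QK` and `NBYTES` are illustrative stand-ins.

```c
// Sketch of the 8-byte interleave + XOR used by the rewritten make_block_q4_0x4,
// with block_q4_0 replaced by plain byte arrays.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QK     32           // stands in for QK4_0
#define NBYTES (QK / 2)     // 16 quant bytes per source block

int main(void) {
    uint8_t src[4][NBYTES];   // 4 source blocks (like block_q4_0.qs)
    uint8_t dst[4 * NBYTES];  // interleaved output (like block_q4_0x4.qs)

    // Tag each source byte with its block id (high nibble) and offset (low nibble)
    // so the interleaving pattern is easy to see in the output.
    for (int b = 0; b < 4; b++)
        for (int j = 0; j < NBYTES; j++)
            src[b][j] = (uint8_t)((b << 4) | (j & 0x0F));

    const unsigned interleave = 8;                    // blck_size_interleave
    const int end = 4 * NBYTES / interleave;          // QK4_0 * 2 / blck_size_interleave
    const uint64_t xor_mask = 0x8888888888888888ULL;  // flips the high bit of every nibble

    for (int i = 0; i < end; ++i) {
        int src_id     = i % 4;
        int src_offset = (i / 4) * interleave;
        int dst_offset = i * interleave;

        uint64_t elems;
        memcpy(&elems, &src[src_id][src_offset], sizeof(uint64_t)); // avoids unaligned loads
        elems ^= xor_mask;
        memcpy(&dst[dst_offset], &elems, sizeof(uint64_t));
    }

    // First 32 output bytes: 8 bytes from block 0, then 8 from block 1, and so on.
    for (int i = 0; i < 32; i++) printf("%02x%c", dst[i], (i % 8 == 7) ? '\n' : ' ');
    return 0;
}
```

Running the sketch shows the output taking 8 bytes at a time from blocks 0, 1, 2 and 3 in turn, each byte XORed with 0x88 — the same ordering the previous per-byte loop produced, but with one masked 64-bit store per chunk.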
@@ -143,14 +143,23 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
        if (GGML_AVX512_VBMI)
            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
            add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+           if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+               list(APPEND ARCH_FLAGS -mavx512vbmi)
+           endif()
        endif()
        if (GGML_AVX512_VNNI)
            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
            add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+           if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+               list(APPEND ARCH_FLAGS -mavx512vnni)
+           endif()
        endif()
        if (GGML_AVX512_BF16)
            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
            add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+           if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+               list(APPEND ARCH_FLAGS -mavx512bf16)
+           endif()
        endif()
        if (GGML_AMX_TILE)
            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)

@@ -211,10 +220,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
    endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
-   execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
-                   OUTPUT_VARIABLE POWER10_M)
-   string(FIND ${POWER10_M} "POWER10" substring_index)
-   if(${substring_index} GREATER_EQUAL 0)
+   execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
+   string(FIND "${POWER10_M}" "POWER10" substring_index)
+   if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
+       set(substring_index -1)
+   endif()
+
+   if (${substring_index} GREATER_EQUAL 0)
        list(APPEND ARCH_FLAGS -mcpu=power10)
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
@@ -3387,19 +3387,42 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
}

// FIXME: this code is duplicated from ggml-aarch64.c
-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x4 out;

    for (int i = 0; i < 4; i++) {
        out.d[i] = in[i].d;
    }

-   for (int i = 0; i < QK4_0 * 2; i++) {
-       int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
-       int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
-       src_offset += (i % blck_size_interleave);
+   const int end = QK4_0 * 2 / blck_size_interleave;

-       out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+   if (blck_size_interleave == 8) {
+       const uint64_t xor_mask = 0x8888888888888888ULL;
+       for (int i = 0; i < end; ++i) {
+           int src_id = i % 4;
+           int src_offset = (i / 4) * blck_size_interleave;
+           int dst_offset = i * blck_size_interleave;
+
+           uint64_t elems;
+           // Using memcpy to avoid unaligned memory accesses
+           memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+           elems ^= xor_mask;
+           memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+       }
+   } else if (blck_size_interleave == 4) {
+       const uint32_t xor_mask = 0x88888888;
+       for (int i = 0; i < end; ++i) {
+           int src_id = i % 4;
+           int src_offset = (i / 4) * blck_size_interleave;
+           int dst_offset = i * blck_size_interleave;
+
+           uint32_t elems;
+           memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+           elems ^= xor_mask;
+           memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+       }
+   } else {
+       GGML_ASSERT(false);
    }

    return out;

@@ -3409,19 +3432,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x8 out;

    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].d;
    }

-   for (int i = 0; i < QK4_0 * 4; i++) {
-       int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
-       int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
-       src_offset += (i % blck_size_interleave);
+   const int end = QK4_0 * 4 / blck_size_interleave;
+   const uint64_t xor_mask = 0x8888888888888888ULL;

-       out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+   for (int i = 0; i < end; ++i) {
+       int src_id = i % 8;
+       int src_offset = (i / 8) * blck_size_interleave;
+       int dst_offset = i * blck_size_interleave;
+
+       uint64_t elems;
+       memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+       elems ^= xor_mask;
+       memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    return out;

@@ -3449,7 +3478,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
            for (int i = 0; i < nrows_interleaved; i++) {
                dst_tmp[i] = src[x + i * nblocks];
            }
-           *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+           *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
        }
        src += nrows_interleaved * nblocks;
    }

@@ -3480,7 +3509,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
            for (int i = 0; i < nrows_interleaved; i++ ) {
                dst_tmp[i] = src[x + i * nblocks];
            }
-           *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+           *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
        }
        src += nrows_interleaved * nblocks;
    }
@@ -1763,7 +1763,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
    fp16 = fp16 && vk12_features.shaderFloat16;

    std::string device_name = props2.properties.deviceName.data();
-   std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+   GGML_LOG_DEBUG("ggml_vulkan: %d = %s (%s) | uma: %d | fp16: %d | warp size: %d\n",
+                  idx, device_name.c_str(), driver_props.driverName, uma, fp16, subgroup_size);

    if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
        std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;

@@ -1821,8 +1822,7 @@ void ggml_vk_instance_init() {
        };
        validation_features.setPNext(nullptr);
        instance_create_info.setPNext(&validation_features);
-
-       std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+       GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
    }
    vk_instance.instance = vk::createInstance(instance_create_info);

@@ -1936,8 +1936,8 @@ void ggml_vk_instance_init() {
            vk_instance.device_indices.push_back(0);
        }
    }
-
-   std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
+   GGML_LOG_DEBUG("ggml_vulkan: Found %d Vulkan devices:\n", vk_instance.device_indices.size());

    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
        ggml_vk_print_gpu_info(i);
@@ -49,6 +49,14 @@

#define UNUSED GGML_UNUSED

+#if defined(_MSC_VER)
+#define m512bh(p) p
+#define m512i(p) p
+#else
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+#endif
+
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
float ggml_table_f32_f16[1 << 16];
@@ -19,22 +19,22 @@ logger = logging.getLogger("compare-llama-bench")

# Properties by which to differentiate results per commit:
KEY_PROPERTIES = [
-   "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
-   "blas", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "n_threads",
-   "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
+   "cpu_info", "gpu_info", "backends", "n_gpu_layers", "model_filename", "model_type", "n_batch", "n_ubatch",
+   "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload",
+   "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
]

# Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
+BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]

# Header names for the table:
PRETTY_NAMES = {
-   "cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
-   "gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
-   "model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
-   "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
-   "main_gpu": "Main GPU", "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split",
-   "use_mmap": "Use mmap", "embeddings": "Embeddings",
+   "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
+   "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
+   "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
+   "embeddings": "Embeddings", "cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll",
+   "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "split_mode": "Split mode", "main_gpu": "Main GPU",
+   "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split", "use_mmap": "Use mmap",
}

DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
@@ -1 +1 @@
-8a3d799484d861748f86eb87c8314fa2dbccc254
+9d0708e863f3aa2fc1eb0b75d433303c30bd0dbc
@@ -2907,9 +2907,15 @@ struct llama_model {
    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

    int64_t t_load_us = 0;
    int64_t t_start_us = 0;

+   // total number of parameters in the model
+   uint64_t n_elements = 0;
+
+   // total size of all the tensors in the model in bytes
+   size_t n_bytes = 0;
+
    // keep track of loaded lora adapters
    std::set<struct llama_lora_adapter *> lora_adapters;

@@ -4275,8 +4281,8 @@ struct llama_model_loader {
    int n_tensors = 0;
    int n_created = 0;

-   int64_t n_elements = 0;
-   size_t n_bytes = 0;
+   uint64_t n_elements = 0;
+   size_t n_bytes = 0;

    bool use_mmap = false;
    bool check_tensors;

@@ -5344,6 +5350,11 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
    }
}

+static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
+   model.n_elements = ml.n_elements;
+   model.n_bytes = ml.n_bytes;
+}
+
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
    model.arch = ml.get_arch();
    if (model.arch == LLM_ARCH_UNKNOWN) {

@@ -9256,6 +9267,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
        throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
    }

+   llm_load_stats(ml, model);
    llm_load_print_meta(ml, model);

    if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&

@@ -18601,6 +18613,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    llama_model model;
    llm_load_arch(ml, model);
    llm_load_hparams(ml, model);
+   llm_load_stats(ml, model);

    struct quantize_state_internal qs(model, params);

@@ -19953,19 +19966,11 @@ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t bu
}

uint64_t llama_model_size(const struct llama_model * model) {
-   uint64_t size = 0;
-   for (const auto & it : model->tensors_by_name) {
-       size += ggml_nbytes(it.second);
-   }
-   return size;
+   return model->n_bytes;
}

uint64_t llama_model_n_params(const struct llama_model * model) {
-   uint64_t nparams = 0;
-   for (const auto & it : model->tensors_by_name) {
-       nparams += ggml_nelements(it.second);
-   }
-   return nparams;
+   return model->n_elements;
}

struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
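After this change, `llama_model_size()` and `llama_model_n_params()` return the totals cached by `llm_load_stats()` at load time instead of walking `tensors_by_name` on every call. A minimal usage sketch follows; the loader entry points and the "model.gguf" path are assumptions based on the public llama.h API of this period, not part of the diff.

```c
// Sketch: print the cached size/parameter totals for a loaded model.
#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Both calls now return values computed once during loading.
    printf("params: %llu\n",       (unsigned long long) llama_model_n_params(model));
    printf("size  : %llu bytes\n", (unsigned long long) llama_model_size(model));

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```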