This commit is contained in:
Djip007 2024-12-01 16:38:13 +03:00 committed by GitHub
commit 905810f91a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 743 additions and 127 deletions

View file

@ -96,6 +96,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE_DEFAULT ON)
endif()
if (NOT DEFINED GGML_OPENMP_SIMD)
set(GGML_OPENMP_SIMD_DEFAULT ON)
endif()
if (NOT DEFINED GGML_AMX)
set(GGML_AMX ON)
endif()

View file

@ -138,6 +138,10 @@ GGML_NO_OPENMP := 1
DEPRECATE_WARNING := 1
endif
ifdef LLAMA_NO_OPENMP_SIMD
GGML_NO_OPENMP_SIMD := 1
endif
ifdef LLAMA_NO_METAL
GGML_NO_METAL := 1
DEPRECATE_WARNING := 1
@ -542,6 +546,12 @@ ifndef GGML_NO_OPENMP
MK_CXXFLAGS += -fopenmp
endif # GGML_NO_OPENMP
ifndef GGML_NO_OPENMP_SIMD
MK_CPPFLAGS += -DGGML_USE_OPENMP_SIMD
MK_CFLAGS += -fopenmp-simd
MK_CXXFLAGS += -fopenmp-simd
endif # GGML_NO_OPENMP_SIMD
ifdef GGML_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@ -948,12 +958,14 @@ OBJ_GGML = \
$(DIR_GGML)/src/ggml-alloc.o \
$(DIR_GGML)/src/ggml-backend.o \
$(DIR_GGML)/src/ggml-backend-reg.o \
$(DIR_GGML)/src/ggml-fp8.o \
$(DIR_GGML)/src/ggml-opt.o \
$(DIR_GGML)/src/ggml-quants.o \
$(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-fp8.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(OBJ_GGML_EXT)
@ -1094,17 +1106,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
# Default target
all: $(BUILD_TARGETS)
# force c++ build for source file that have same name as c file
# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/include/ggml-backend.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
# Rules for building object files
$(DIR_GGML)/%.o: $(DIR_GGML)/%.c

View file

@ -20,6 +20,8 @@ var sources = [
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
"ggml/src/ggml-threading.cpp",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-fp8.cpp",
"ggml/src/ggml-cpu/ggml-cpu-fp8.cpp",
]
var resources: [Resource] = []
@ -88,5 +90,5 @@ let package = Package(
linkerSettings: linkerSettings
)
],
cxxLanguageStandard: .cxx11
cxxLanguageStandard: .cxx17
)

View file

@ -1797,9 +1797,9 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
total_seconds = total_seconds % (60*60);
}
LOG("%.2f minutes\n", total_seconds / 60.0);
LOG("\n");
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
}
LOG("\n");
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);

View file

@ -51,6 +51,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "E4M3_Q", LLAMA_FTYPE_MOSTLY_E4M3_Q, "12.21G, 0.0050 kld @ Mistral-Nemo", },
{ "E3M4_Q", LLAMA_FTYPE_MOSTLY_E3M4_Q, "12.21G, 0.0016 kld @ Mistral-Nemo", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

View file

@ -62,6 +62,10 @@ if (NOT GGML_LLAMAFILE_DEFAULT)
set(GGML_LLAMAFILE_DEFAULT OFF)
endif()
if (NOT GGML_OPENMP_SIMD_DEFAULT)
set(GGML_OPENMP_SIMD_DEFAULT OFF)
endif()
if (NOT GGML_CUDA_GRAPHS_DEFAULT)
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
endif()
@ -112,6 +116,7 @@ option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
option(GGML_OPENMP_SIMD "ggml: enable OPENMP_SIMD" ${GGML_OPENMP_SIMD_DEFAULT})
if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")

View file

@ -392,6 +392,10 @@ extern "C" {
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_E5M2 = 39,
GGML_TYPE_E4M3 = 40,
GGML_TYPE_E4M3_Q = 41,
GGML_TYPE_E3M4_Q = 42,
GGML_TYPE_COUNT,
};
@ -436,6 +440,10 @@ extern "C" {
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_E5M2 = 28, // except 1d tensors
GGML_FTYPE_MOSTLY_E4M3 = 29, // except 1d tensors
GGML_FTYPE_MOSTLY_E4M3_Q = 30, // except 1d tensors
GGML_FTYPE_MOSTLY_E3M4_Q = 31, // except 1d tensors
};
// available tensor operations:

View file

@ -222,7 +222,10 @@ add_library(ggml-base
ggml-quants.c
ggml-quants.h
ggml-aarch64.c
ggml-aarch64.h)
ggml-aarch64.h
ggml-fp8.cpp
ggml-fp8.h
)
target_include_directories(ggml-base PRIVATE .)

View file

@ -6,7 +6,20 @@
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;
#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;
// standard C++ allows anonymous unions, but some compilers warn on them
#define GGML_COMMON_AGGR_U data
// standard C++ does not allow anonymous structs.
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL)
@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;
#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
@ -29,7 +43,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;
#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_HIP)
@ -39,7 +54,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;
#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_SYCL)
@ -49,7 +65,8 @@ typedef half2 ggml_half2;
typedef sycl::half ggml_half;
typedef sycl::half2 ggml_half2;
#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#endif
@ -154,9 +171,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
@ -175,9 +192,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
@ -196,9 +213,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half s; // d * sum(qs[i])
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 ds;
};
} GGML_COMMON_AGGR_U;
int8_t qs[QK8_1]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
@ -261,9 +278,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
@ -288,9 +305,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
@ -305,9 +322,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
@ -424,6 +441,24 @@ typedef struct {
} block_iq4_nlx4;
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
// fp8 support
// - fp8 simple type
typedef struct { uint8_t bits; } ggml_e5m2_t;
typedef struct { uint8_t bits; } ggml_e4m3_t;
// - fp8 with bloc delta => 8.125 bpw
typedef struct {
float d; // delta
uint8_t qs[QK_K];
} block_e4m3_q;
static_assert(sizeof(block_e4m3_q) == sizeof(float) + QK_K, "wrong block_e4m3_q block size/padding");
typedef struct {
float d; // delta
uint8_t qs[QK_K];
} block_e3m4_q;
static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");
#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL
@ -437,6 +472,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };
#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };
#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib>

View file

@ -7,6 +7,8 @@ list (APPEND GGML_CPU_SOURCES
ggml-cpu-aarch64.h
ggml-cpu-quants.c
ggml-cpu-quants.h
ggml-cpu-fp8.cpp
ggml-cpu-fp8.h
amx/amx.cpp
amx/amx.h
amx/mmq.cpp
@ -45,6 +47,18 @@ if (GGML_OPENMP)
endif()
endif()
if (GGML_OPENMP_SIMD)
check_cxx_compiler_flag("-fopenmp-simd" SUPPORTS_OPENMP_SIMD)
if (SUPPORTS_OPENMP_SIMD)
# OpenMP_RUNTIME_MSVC=experimental / if (MSVC)
message(STATUS "Using OPENMP_SIMD.")
add_compile_definitions(GGML_USE_OPENMP_SIMD)
set(OPENMP_SIMD_FLAGS -fopenmp-simd)
else()
message(WARNING "C++ compiler lacks OPENMP_SIMD support.")
endif()
endif()
if (GGML_LLAMAFILE)
message(STATUS "Using llamafile")
@ -304,3 +318,11 @@ set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "
if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
endif()
# FP8
if (OPENMP_SIMD_FLAGS)
# set_source_files_properties(ggml-cpu-fp8.cpp PROPERTIES COMPILE_FLAGS ${OPENMP_SIMD_FLAGS})
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS ${OPENMP_SIMD_FLAGS})
endif()

View file

@ -0,0 +1,260 @@
#include <cassert>
#include <algorithm>
#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#include "ggml.h"
#include "ggml-cpu-fp8.h"
namespace fp8 {
// Type-punning helper between a float and its raw 32-bit pattern.
// NOTE(review): reading the inactive union member is UB in strict C++ but is
// the idiom this codebase relies on (well-defined on GCC/Clang in practice).
union fp32_int32 {
float f;
uint32_t bits;
};
// Encode one float as an FP8 code with E exponent bits: round to nearest
// even, saturate to the 0x7E pattern at/above FP8<E>::MAX, flush to signed
// zero below FP8<E>::MIN. Returns the raw 8-bit pattern.
// NOTE(review): NaN inputs fail both range tests and fall into the rescale
// path, producing an unspecified code — inputs are assumed finite.
#ifdef GGML_USE_OPENMP_SIMD
#pragma omp declare simd
#endif
template<int E>
inline uint8_t from_float(float value) {
FP8<E> out;
fp32_int32 in = {value};
out.bits = (in.bits >> 24) & 0x80; // sign: float bit 31 -> fp8 bit 7
in.bits &= 0x7fffffff; // strip sign for the magnitude tests
if (in.f >= FP8<E>::MAX) {
out.bits |= 0x7E; // saturate overflow (+Inf lands here too)
} else if (in.f < FP8<E>::MIN) { // => 0.
} else {
// rescale from the float exponent bias (127) to the fp8 bias
in.f *= exp_f2<FP8<E>::E_BIAS-127>();
// round to nearest even on the bits about to be shifted out
uint32_t eps = (0x3fffff>>FP8<E>::M) + ((in.bits >> (23-FP8<E>::M)) & 0x1);
in.bits += eps;
out.bits |= (in.bits >> (23-FP8<E>::M)) & 0x7F;
}
return out.bits;
}
// Decode one FP8 code back to float: re-expand sign/exponent/mantissa into
// the float bit layout, then undo the bias difference with an exact
// power-of-two multiply.
#ifdef GGML_USE_OPENMP_SIMD
#pragma omp declare simd
#endif
template<int E>
inline float to_float(const FP8<E>& in) {
fp32_int32 out = {0};
out.bits = in.bits & 0x80; // sign ...
out.bits <<= 24; // ... into float bit 31
uint32_t _bits = in.bits & 0x7F;
_bits <<= (23-FP8<E>::M); // exponent+mantissa into float position
out.bits |= _bits;
out.f *= exp_f2<127-FP8<E>::E_BIAS>(); // bias correction
return out.f;
}
} // namespace fp8
// Encode `size` floats into plain (non-blocked) FP8 codes; the loop shape is
// kept simple so `omp simd` can vectorize it.
template<int E>
static inline void conv(const float* x, FP8<E>* y, int64_t size) {
#ifdef GGML_USE_OPENMP_SIMD
#pragma omp simd
#endif
    for (int64_t pos = 0; pos < size; ++pos) {
        y[pos].bits = fp8::from_float<E>(x[pos]);
    }
}
// Dot product between an FP8 vector and a float vector.
template<int E>
static inline float dot(const FP8<E>* x, const float* y, int64_t size) {
    float acc = 0;
#ifdef GGML_USE_OPENMP_SIMD
#pragma omp simd reduction(+:acc)
#endif
    for (int64_t k = 0; k < size; ++k) {
        acc += fp8::to_float(x[k])*y[k];
    }
    return acc;
}
// One quantization block: QK FP8 codes sharing a single float scale.
// Memory layout matches block_e4m3_q / block_e3m4_q from ggml-common.h.
template <int E, int QK>
struct bloc_fp8 {
float d; // block delta: real value = fp8::to_float(qs[i]) * d
FP8<E> qs[QK];
};
// Quantize `size` floats into FP8 blocks of QK values with one shared scale,
// chosen so the block's largest magnitude maps to FP8<E>::MAX.
// `size` must be a multiple of QK (callers assert this).
template <int E, int QK>
static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
    const auto qk_size = size / QK;
    for (int64_t q=0; q<qk_size; ++q) {
        // absolute maximum over the block
        float m = 0;
#ifdef GGML_USE_OPENMP_SIMD
        // "#pragma omp simd reduction(max:m)" did not work on macOS and
        // produced warnings, so this loop stays scalar.
#endif
        for (int64_t i=0; i<QK; i++) {
            m = std::max(std::abs(x[q*QK+i]),m);
        }
        // Guard the all-zero block: the unguarded MAX/m would be +inf and
        // 0*inf would quantize every element of the block to NaN.
        const float D = (m > 0) ? FP8<E>::MAX/m : 0.f;
        y[q].d = m/FP8<E>::MAX; // stored delta (0 for an all-zero block)
#ifdef GGML_USE_OPENMP_SIMD
#pragma omp simd
#endif
        for (int64_t i=0; i<QK; i++) {
            y[q].qs[i].bits = fp8::from_float<E>(x[q*QK+i]*D);
        }
    }
}
template <int E, int QK>
static inline float dot(const bloc_fp8<E, QK>* x, const float* y, int64_t size) {
float z = 0;
const auto qk_size = size / QK;
for (int64_t q=0; q<qk_size; ++q) {
float z0 = 0;
#ifdef GGML_USE_OPENMP_SIMD
#pragma omp simd reduction(+:z0)
#endif
for (int64_t i=0; i<QK; i++) {
z0 += fp8::to_float(x[q].qs[i])*y[q*QK+i];
}
z += (x[q]).d * z0;
}
return z;
}
// Dot product between block-quantized FP8 weights and a vector, written as
// fixed-size elementwise loops so the compiler can auto-vectorize them:
// VECT_SIZE = lanes per (virtual) SIMD register, NB_REG = registers used.
// Each FP8 code is expanded to a bf16-like pattern (sign in bit 15,
// exponent+mantissa shifted up) placed in the top 16 bits of a float; the
// exponent-bias correction deliberately skipped there is folded into the
// per-block scale in the "apply scale" step below.
// NOTE(review): assumes `size` is a multiple of QK — TODO confirm callers.
template <int VECT_SIZE, int NB_REG, int E, int QK, typename _Y>
float dot_reg(const bloc_fp8<E, QK>* x, const _Y* y, int64_t size) {
static_assert(QK%(VECT_SIZE*NB_REG)==0, "size not supported");
using fp8_t = FP8<E>;
float z = 0;
// per-register, per-lane accumulators for the whole row
float Z[NB_REG][VECT_SIZE];
for(int64_t r=0; r<NB_REG; ++r) {
for(int64_t v=0; v<VECT_SIZE; ++v) Z[r][v] = 0;
}
const auto qk_size = size / QK;
for (int64_t q=0; q<qk_size; ++q) {
// per-block accumulators (scaled once per block after the inner loops)
float Z0[NB_REG][VECT_SIZE];
for(int64_t r=0; r<NB_REG; ++r) {
for(int64_t v=0; v<VECT_SIZE; ++v) Z0[r][v] = 0;
}
for (int64_t i=0; i<QK; i+=VECT_SIZE*NB_REG) {
for(int64_t r=0; r<NB_REG; ++r) {
uint8_t x_8bits[VECT_SIZE];
uint8_t sign_8bits[VECT_SIZE];
uint8_t mantice_8bits[VECT_SIZE];
uint16_t sign_16bits[VECT_SIZE];
uint16_t mantice_16bits[VECT_SIZE];
uint16_t x_bf16[VECT_SIZE];
union { uint32_t bits; float f; } ux[VECT_SIZE];
float X[VECT_SIZE];
float Y[VECT_SIZE];
// load + split: sign bit and 7-bit exponent/mantissa field
for(int64_t v=0; v<VECT_SIZE; ++v) { x_8bits[v] = x[q].qs[i+r*VECT_SIZE+v].bits; }
for(int64_t v=0; v<VECT_SIZE; ++v) { sign_8bits[v] = x_8bits[v] & 0x80; }
for(int64_t v=0; v<VECT_SIZE; ++v) { mantice_8bits[v] = x_8bits[v] & 0x7F; }
// widen to 16 bits and move each field to its bf16 position
for(int64_t v=0; v<VECT_SIZE; ++v) { sign_16bits[v] = sign_8bits[v]; }
for(int64_t v=0; v<VECT_SIZE; ++v) { mantice_16bits[v] = mantice_8bits[v]; }
for(int64_t v=0; v<VECT_SIZE; ++v) { sign_16bits[v] <<= 8; }
for(int64_t v=0; v<VECT_SIZE; ++v) { mantice_16bits[v] <<= (7-fp8_t::M); }
for(int64_t v=0; v<VECT_SIZE; ++v) { x_bf16[v] = sign_16bits[v] | mantice_16bits[v]; }
// place the 16-bit pattern in the high half of a float
for(int64_t v=0; v<VECT_SIZE; ++v) { ux[v].bits = x_bf16[v]; }
for(int64_t v=0; v<VECT_SIZE; ++v) { ux[v].bits <<= 16; }
for(int64_t v=0; v<VECT_SIZE; ++v) { X[v] = ux[v].f; } // * exp_f2<127-fp8_t::E_BIAS>(); }
for(int64_t v=0; v<VECT_SIZE; ++v) { Y[v] = (float)y[q*QK+i+r*VECT_SIZE+v]; }
for(int64_t v=0; v<VECT_SIZE; ++v) { Z0[r][v] += X[v]*Y[v]; }
}
}
// apply scale (block delta times the deferred bias correction)
for(int64_t r=0; r<NB_REG; ++r) {
for(int64_t v=0; v<VECT_SIZE; ++v) {
Z[r][v] += Z0[r][v]*(x[q]).d * exp_f2<127-fp8_t::E_BIAS>();
}
}
}
// reduction 1: fold all registers into register 0
for(int64_t r=1; r<NB_REG; ++r) {
for(int64_t v=0; v<VECT_SIZE; ++v) {
Z[0][v] += Z[r][v];
}
}
// reduction 2: horizontal sum of the remaining lanes
for(int64_t v=0; v<VECT_SIZE; ++v) {
z += Z[0][v];
}
return z;
}
// the C API.
// Row encoders (fp32 -> fp8), exposed as ggml_from_float_t in ggml-cpu.c.
void ggml_fp32_to_e5m2_row(const float * x, ggml_e5m2_t * y, int64_t k) {
conv(x, reinterpret_cast<FP8<5>*>(y), k);
}
void ggml_fp32_to_e4m3_row(const float * x, ggml_e4m3_t * y, int64_t k) {
conv(x, reinterpret_cast<FP8<4>*>(y), k);
}
// Blocked encoders: k must be a whole number of QK_K blocks.
void quantize_row_e4m3_q(const float * x, block_e4m3_q * y, int64_t k) {
assert(k % QK_K == 0);
conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}
void quantize_row_e3m4_q(const float * x, block_e3m4_q * y, int64_t k) {
assert(k % QK_K == 0);
conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}
// the dot product for FP8 weight
// Standard ggml vec_dot signature; the stride arguments are unused because
// only nrc == 1 is supported.
void ggml_vec_dot_e5m2(int n, float * s, size_t bs, const ggml_e5m2_t * vx, size_t bx, const float * vy, size_t by, int nrc) {
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);
GGML_UNUSED(by);
GGML_UNUSED(bs);
*s = dot(reinterpret_cast<const FP8<5>*>(vx), vy, n);
}
void ggml_vec_dot_e4m3(int n, float * s, size_t bs, const ggml_e4m3_t * vx, size_t bx, const float * vy, size_t by, int nrc) {
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);
GGML_UNUSED(by);
GGML_UNUSED(bs);
*s = dot(reinterpret_cast<const FP8<4>*>(vx), vy, n);
}
// Blocked dot products: pick a register shape (lanes x registers) per ISA
// so dot_reg's loops map onto the target's SIMD width.
void ggml_vec_dot_e4m3_q(int n, float * s, size_t bs, const block_e4m3_q * vx, size_t bx, const float * vy, size_t by, int nrc) {
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);
GGML_UNUSED(by);
GGML_UNUSED(bs);
#if defined(__AVX512F__) // 32xfloat32x16_t
*s = dot_reg<16,4>(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
#elif defined(__AVX__) || defined(__AVX2__) // 16xfloat32x8_t
*s = dot_reg<8,4>(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
#elif defined(__ARM_NEON) // 32xfloat32x4_t
*s = dot_reg<4,4>(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
// #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) // 32xfloat16x8_t
#else
*s = dot(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
#endif
}
void ggml_vec_dot_e3m4_q(int n, float * s, size_t bs, const block_e3m4_q * vx, size_t bx, const float * vy, size_t by, int nrc) {
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);
GGML_UNUSED(by);
GGML_UNUSED(bs);
#if defined(__AVX512F__) // 32xfloat32x16_t
*s = dot_reg<16,4>(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
#elif defined(__AVX__) || defined(__AVX2__) // 16xfloat32x8_t
*s = dot_reg<8,4>(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
#elif defined(__ARM_NEON) // 32xfloat32x4_t
*s = dot_reg<4,4>(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
// #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) // 32xfloat16x8_t
#else
*s = dot(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
#endif
}

View file

@ -0,0 +1,21 @@
#include "ggml-fp8.h"
#ifdef __cplusplus
extern "C" {
#endif
void ggml_fp32_to_e5m2_row(const float * x, ggml_e5m2_t * y, int64_t k);
void ggml_fp32_to_e4m3_row(const float * x, ggml_e4m3_t * y, int64_t k);
void quantize_row_e4m3_q(const float * x, block_e4m3_q * y, int64_t k);
void quantize_row_e3m4_q(const float * x, block_e3m4_q * y, int64_t k);
// TODO: the best depend on the CPU fp32 / bf16 / fp16
#define GGML_FP8_VECT_DOT_TYPE GGML_TYPE_F32
void ggml_vec_dot_e5m2 (int n, float * s, size_t bs, const ggml_e5m2_t * vx, size_t bx, const float * vy, size_t by, int nrc);
void ggml_vec_dot_e4m3 (int n, float * s, size_t bs, const ggml_e4m3_t * vx, size_t bx, const float * vy, size_t by, int nrc);
void ggml_vec_dot_e4m3_q(int n, float * s, size_t bs, const block_e4m3_q * vx, size_t bx, const float * vy, size_t by, int nrc);
void ggml_vec_dot_e3m4_q(int n, float * s, size_t bs, const block_e3m4_q * vx, size_t bx, const float * vy, size_t by, int nrc);
#ifdef __cplusplus
}
#endif

View file

@ -1,11 +1,15 @@
#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
#define _USE_MATH_DEFINES // For M_PI on MSVC
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-fp8.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
@ -457,6 +461,30 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.gemv = ggml_gemv_iq4_nl_4x4_q8_0,
.gemm = ggml_gemm_iq4_nl_4x4_q8_0,
},
[GGML_TYPE_E5M2] = {
.from_float = (ggml_from_float_t) ggml_fp32_to_e5m2_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_e5m2,
.vec_dot_type = GGML_FP8_VECT_DOT_TYPE,
.nrows = 1,
},
[GGML_TYPE_E4M3] = {
.from_float = (ggml_from_float_t) ggml_fp32_to_e4m3_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_e4m3,
.vec_dot_type = GGML_FP8_VECT_DOT_TYPE,
.nrows = 1,
},
[GGML_TYPE_E4M3_Q] = {
.from_float = (ggml_from_float_t) quantize_row_e4m3_q,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_e4m3_q,
.vec_dot_type = GGML_FP8_VECT_DOT_TYPE,
.nrows = 1,
},
[GGML_TYPE_E3M4_Q] = {
.from_float = (ggml_from_float_t) quantize_row_e3m4_q,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_e3m4_q,
.vec_dot_type = GGML_FP8_VECT_DOT_TYPE,
.nrows = 1,
},
};
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@ -4509,6 +4537,10 @@ static void ggml_compute_forward_add(
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_E5M2 :
case GGML_TYPE_E4M3 :
case GGML_TYPE_E4M3_Q:
case GGML_TYPE_E3M4_Q:
{
ggml_compute_forward_add_q_f32(params, dst);
} break;
@ -4889,6 +4921,10 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_E5M2 :
case GGML_TYPE_E4M3 :
case GGML_TYPE_E4M3_Q:
case GGML_TYPE_E3M4_Q:
{
ggml_compute_forward_add1_q_f32(params, dst);
} break;
@ -4992,33 +5028,6 @@ static void ggml_compute_forward_acc(
{
ggml_compute_forward_acc_f32(params, dst);
} break;
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default:
{
GGML_ABORT("fatal error");
@ -8259,33 +8268,6 @@ static void ggml_compute_forward_set(
{
ggml_compute_forward_set_f32(params, dst);
} break;
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default:
{
GGML_ABORT("fatal error");
@ -8550,6 +8532,10 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_E5M2 :
case GGML_TYPE_E4M3 :
case GGML_TYPE_E4M3_Q:
case GGML_TYPE_E3M4_Q:
{
ggml_compute_forward_get_rows_q(params, dst);
} break;
@ -9114,41 +9100,7 @@ static void ggml_compute_forward_clamp(
{
ggml_compute_forward_clamp_f32(params, dst);
} break;
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_IQ4_NL_4_4:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_COUNT:
default:
{
GGML_ABORT("fatal error");
}

134
ggml/src/ggml-fp8.cpp Normal file
View file

@ -0,0 +1,134 @@
#include <cassert>
#include <algorithm>
#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#include "ggml.h"
#include "ggml-fp8.h"
// Type-punning helper between a float and its raw IEEE-754 bit pattern.
// NOTE(review): reading the inactive union member is UB in strict C++, but it
// is the long-standing idiom this code relies on (well-defined on GCC/Clang).
union fp32_int32 {
float f;
uint32_t bits;
};
// Encode one float as FP8<E> (reference scalar version).
// Rounds to nearest even, saturates to the 0x7E code at/above FP8<E>::MAX
// and flushes to signed zero below FP8<E>::MIN.
// NOTE(review): NaN inputs fail both range tests and fall into the rescale
// path, producing an unspecified code — callers should pass finite values
// (see the commented-out assert below).
template<int E>
inline FP8<E> float_to_fp8(float value) {
FP8<E> out;
fp32_int32 in = {value};
// the sign: float bit 31 -> fp8 bit 7
out.bits = (in.bits >> 24) & 0x80;
// value without sign, so the magnitude tests below are sign-free
in.bits &= 0x7fffffff;
//GGML_ASSERT(in.bits < 0x7f800000); // +/- infinity or NAN
if (in.f >= FP8<E>::MAX) {
out.bits |= 0x7E; // saturate overflow (+Inf lands here too)
} else if (in.f < FP8<E>::MIN) { // => 0.
// OK: S.0000000
} else {
// rescale from the float exponent bias (127) to the fp8 bias
in.f *= exp_f2<FP8<E>::E_BIAS-127>();
// - trunc
//uint32_t eps = 0;
// - rounding half away from zero
//uint32_t eps = 0x400000>>FP8<E>::M;
// - rounding half toward zero
//uint32_t eps = 0x3fffff>>FP8<E>::M;
// - rounding to nearest even
uint32_t eps = (0x3fffff>>FP8<E>::M) + ((in.bits >> (23-FP8<E>::M)) & 0x1);
// shift mantissa.
in.bits += eps;
out.bits |= (in.bits >> (23-FP8<E>::M)) & 0x7F;
}
return out;
}
// Decode one FP8<E> back to float (reference scalar version): re-expand
// sign/exponent/mantissa into the float bit layout, then undo the
// exponent-bias difference with an exact power-of-two multiply.
template<int E>
inline float fp8_to_float(const FP8<E>& in) {
fp32_int32 out = {0};
out.bits = in.bits & 0x80; // sign ...
out.bits <<= 24; // ... into float bit 31
uint32_t _bits = in.bits & 0x7F;
_bits <<= (23-FP8<E>::M); // exponent+mantissa into float position
out.bits |= _bits;
out.f *= exp_f2<127-FP8<E>::E_BIAS>(); // bias correction
return out.f;
}
template<int E>
static inline void conv(const FP8<E>* x, float* y, int64_t size) {
for (int64_t i=0; i<size; i++) {
y[i] = fp8_to_float(x[i]);
}
}
// Encode a contiguous run of floats into FP8 values (rounding and
// saturation behavior come from float_to_fp8).
template<int E>
static inline void conv(const float* x, FP8<E>* y, int64_t size) {
    for (int64_t pos = 0; pos < size; ++pos) {
        y[pos] = float_to_fp8<E>(x[pos]);
    }
}
// One quantization block: QK FP8 codes sharing a single float scale.
// Memory layout matches block_e4m3_q / block_e3m4_q from ggml-common.h.
template <int E, int QK>
struct bloc_fp8 {
float d; // block delta: real value = fp8_to_float(qs[i]) * d
FP8<E> qs[QK];
};
template <int E, int QK>
static inline void conv(const bloc_fp8<E, QK>* x, float* y, int64_t size) {
const auto qk_size = size / QK;
for (int64_t q=0; q<qk_size; ++q) {
for (int64_t i=0; i<QK; i++) {
y[q*QK+i] = fp8_to_float(x[q].qs[i])*(x[q].d);
}
}
}
// Quantize `size` floats into FP8 blocks of QK values sharing one float
// scale: the scale is chosen so the block's largest magnitude maps to
// FP8<E>::MAX. `size` must be a multiple of QK (callers assert this).
template <int E, int QK>
static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
    const auto qk_size = size / QK;
    for (int64_t q=0; q<qk_size; ++q) {
        // absolute maximum over the block
        float m = 0;
        for (int64_t i=0; i<QK; i++) {
            m = std::max(std::abs(x[q*QK+i]),m);
        }
        // Guard the all-zero block: the unguarded MAX/m would be +inf and
        // 0*inf would quantize every element of the block to NaN.
        const float D = (m > 0) ? FP8<E>::MAX/m : 0.f;
        y[q].d = m/FP8<E>::MAX; // stored delta (0 for an all-zero block)
        for (int64_t i=0; i<QK; i++) {
            y[q].qs[i] = float_to_fp8<E>(x[q*QK+i]*D);
        }
    }
}
// the C API.
// Thin wrappers exposing the templated reference converters to C; they only
// reinterpret the opaque C types as their typed C++ equivalents.
void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
conv(reinterpret_cast<const FP8<5>*>(x), y, k);
}
void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
conv(x, reinterpret_cast<FP8<5>*>(y), k);
}
void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
conv(reinterpret_cast<const FP8<4>*>(x), y, k);
}
void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
conv(x, reinterpret_cast<FP8<4>*>(y), k);
}
// Blocked variants: k must be a whole number of QK_K blocks.
void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
conv(reinterpret_cast<const bloc_fp8<4, QK_K>*>(x), y, k);
}
void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}
void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
conv(reinterpret_cast<const bloc_fp8<3, QK_K>*>(x), y, k);
}
void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
assert(k % QK_K == 0);
conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}

45
ggml/src/ggml-fp8.h Normal file
View file

@ -0,0 +1,45 @@
// This header behaves more like a .inc file: it is included from both C and C++ sources.
#ifdef __cplusplus
// Compile-time integer power of two: exp_i2<N>() == 2^N (N >= 0).
template<int N>
constexpr int exp_i2() {
    int pow2 = 1;
    for (int i = 0; i < N; ++i) {
        pow2 *= 2;
    }
    return pow2;
}
// Compile-time float power of two: exp_f2<N>() == 2.0f^N (N may be negative).
template<int N>
constexpr float exp_f2() {
    if constexpr (N == 0) {
        return 1.f;
    } else if constexpr (N > 0) {
        return 2.f * exp_f2<N - 1>();
    } else {
        return exp_f2<N + 1>() / 2.f;
    }
}
// Generic 8-bit floating point: 1 sign bit, _E exponent bits and (7-_E)
// mantissa bits packed in one byte (IEEE-754-like layout).
template<int _E> // mantissa width is implied: M = 7 - _E
struct FP8 {
uint8_t bits; // raw encoding: sign | exponent | mantissa
using type = FP8<_E>;
static constexpr int E = _E; // exponent bit count
static constexpr int M = (7-_E); // mantissa bit count
static constexpr int E_BIAS = exp_i2<E-1>()-1; // exponent bias: 2^(E-1) - 1
// Largest value a finite code can represent: (2 - 2^(1-M)) * 2^(2^(E-1));
// values at/above this are saturated to the 0x7E pattern by the converters.
static constexpr float MAX = (2-exp_f2<-M+1>())*exp_f2<exp_i2<E-1>()>();
// Smallest positive encodable magnitude: 2^-M * 2^(2 - 2^(E-1));
// values below this flush to signed zero in the converters.
static constexpr float MIN = exp_f2<-M>()*exp_f2<2-exp_i2<E-1>()>();
};
extern "C" {
#endif
// Note: types are define in ggml-common.h
GGML_API void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
#ifdef __cplusplus
}
#endif

View file

@ -5229,7 +5229,26 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
} break;
case GGML_TYPE_E4M3_Q:
case GGML_TYPE_E3M4_Q:
{
// Not really clean, but the validation is identical for E4M3 and E3M4 blocks.
const block_e3m4_q * q = (const block_e3m4_q *) data;
int nans = 0;
for (size_t i = 0; i < nb; ++i) {
if (!validate_float(q[i].d, i)) {
return false;
}
// NAN
for (size_t k = 0; k < QK_K; ++k) {
nans += (q[i].qs[k] & 0x7f) == 0x7f;
}
}
if (nans) {
fprintf(stderr, "%s: found %d NaNs in row of %zu FP8 values\n", __func__, nans, nb*QK_K);
return false;
}
} break;
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:

View file

@ -9,6 +9,7 @@
// FIXME: required here for quantization functions
#include "ggml-quants.h"
#include "ggml-aarch64.h"
#include "ggml-fp8.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@ -840,6 +841,38 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.to_float = NULL,
.from_float_ref = NULL,
},
[GGML_TYPE_E5M2] = {
.type_name = "fp8_e5m2",
.blck_size = 1,
.type_size = sizeof(ggml_e5m2_t),
.is_quantized = true,
.to_float = (ggml_to_float_t) ggml_e5m2_to_fp32_row,
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_e5m2_row_ref,
},
[GGML_TYPE_E4M3] = {
.type_name = "fp8_e4m3",
.blck_size = 1,
.type_size = sizeof(ggml_e4m3_t),
.is_quantized = true,
.to_float = (ggml_to_float_t) ggml_e4m3_to_fp32_row,
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_e4m3_row_ref,
},
[GGML_TYPE_E4M3_Q] = {
.type_name = "fp8_e4m3_q",
.blck_size = QK_K,
.type_size = sizeof(block_e4m3_q),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_e4m3_q,
.from_float_ref = (ggml_from_float_t) quantize_row_e4m3_q_ref,
},
[GGML_TYPE_E3M4_Q] = {
.type_name = "fp8_e3m4_q",
.blck_size = QK_K,
.type_size = sizeof(block_e3m4_q),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_e3m4_q,
.from_float_ref = (ggml_from_float_t) quantize_row_e3m4_q_ref,
},
};
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@ -1271,6 +1304,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
case GGML_FTYPE_MOSTLY_E5M2: wtype = GGML_TYPE_E5M2; break;
case GGML_FTYPE_MOSTLY_E4M3: wtype = GGML_TYPE_E4M3; break;
case GGML_FTYPE_MOSTLY_E4M3_Q: wtype = GGML_TYPE_E4M3_Q; break;
case GGML_FTYPE_MOSTLY_E3M4_Q: wtype = GGML_TYPE_E3M4_Q; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
@ -6274,6 +6311,26 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_E5M2 :
{ // move to ggml-cpu.c : type_traits[type].from_float(src + start, (char *) dst + start_row * row_size, (int64_t)nrows*n_per_row);
ggml_fp32_to_e5m2_row_ref(src + start, (ggml_e5m2_t*)((char *) dst + start_row * row_size), (int64_t)nrows*n_per_row);
result = nrows * row_size;
} break;
case GGML_TYPE_E4M3 :
{ // move to ggml-cpu.c : type_traits[type].from_float(src + start, (char *) dst + start_row * row_size, (int64_t)nrows*n_per_row);
ggml_fp32_to_e4m3_row_ref(src + start, (ggml_e4m3_t*)((char *) dst + start_row * row_size), (int64_t)nrows*n_per_row);
result = nrows * row_size;
} break;
case GGML_TYPE_E4M3_Q:
{ // move to ggml-cpu.c : type_traits[type].from_float(src + start, (char *) dst + start_row * row_size, (int64_t)nrows*n_per_row);
quantize_row_e4m3_q_ref(src + start, (block_e4m3_q*)((char *) dst + start_row * row_size), (int64_t)nrows*n_per_row);
result = nrows * row_size;
} break;
case GGML_TYPE_E3M4_Q:
{ // move to ggml-cpu.c : type_traits[type].from_float(src + start, (char *) dst + start_row * row_size, (int64_t)nrows*n_per_row);
quantize_row_e3m4_q_ref(src + start, (block_e3m4_q*)((char *) dst + start_row * row_size), (int64_t)nrows*n_per_row);
result = nrows * row_size;
} break;
case GGML_TYPE_F16:
{
size_t elemsize = sizeof(ggml_fp16_t);

View file

@ -176,6 +176,10 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
LLAMA_FTYPE_MOSTLY_E5M2 = 38, // except 1d tensors
LLAMA_FTYPE_MOSTLY_E4M3 = 39, // except 1d tensors
LLAMA_FTYPE_MOSTLY_E4M3_Q = 40, // except 1d tensors
LLAMA_FTYPE_MOSTLY_E3M4_Q = 41, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};

View file

@ -26,7 +26,7 @@ function has_cmd {
}
if has_cmd wget; then
cmd="wget -q --show-progress -c -O %s/%s %s"
cmd="wget -q -c -O %s/%s %s"
elif has_cmd curl; then
cmd="curl -C - -f --output-dir %s -o %s -L %s"
else

View file

@ -4517,6 +4517,10 @@ struct llama_model_loader {
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
case GGML_TYPE_E5M2: ftype = LLAMA_FTYPE_MOSTLY_E5M2; break;
case GGML_TYPE_E4M3: ftype = LLAMA_FTYPE_MOSTLY_E4M3; break;
case GGML_TYPE_E4M3_Q: ftype = LLAMA_FTYPE_MOSTLY_E4M3_Q; break;
case GGML_TYPE_E3M4_Q: ftype = LLAMA_FTYPE_MOSTLY_E3M4_Q; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@ -5283,6 +5287,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
case LLAMA_FTYPE_MOSTLY_E5M2: return "E5M2";
case LLAMA_FTYPE_MOSTLY_E4M3: return "E4M3";
case LLAMA_FTYPE_MOSTLY_E4M3_Q: return "E4M3_Q";
case LLAMA_FTYPE_MOSTLY_E3M4_Q: return "E3M4_Q";
default: return "unknown, may not work";
}
@ -18422,6 +18430,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
new_type = GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_E4M3_Q) {
new_type = GGML_TYPE_E4M3_Q;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_E3M4_Q) {
new_type = GGML_TYPE_E3M4_Q;
}
else if (new_type != GGML_TYPE_Q8_0) {
new_type = GGML_TYPE_Q6_K;
}
@ -18447,6 +18461,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_E4M3_Q || ftype == LLAMA_FTYPE_MOSTLY_E3M4_Q) {
new_type = tensor->type;
}
}
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@ -18634,7 +18651,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
new_type == GGML_TYPE_IQ1_M) {
new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_E4M3_Q || new_type == GGML_TYPE_E3M4_Q) {
int nx = tensor->ne[0];
int ny = tensor->ne[1];
if (nx % QK_K != 0) {
@ -18661,6 +18678,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
case GGML_TYPE_E4M3_Q:
case GGML_TYPE_E3M4_Q: new_type = tensor->type; break;
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
}
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
@ -18770,6 +18789,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
// FP8
case LLAMA_FTYPE_MOSTLY_E5M2: default_type = GGML_TYPE_E5M2; break;
case LLAMA_FTYPE_MOSTLY_E4M3: default_type = GGML_TYPE_E4M3; break;
case LLAMA_FTYPE_MOSTLY_E4M3_Q: default_type = GGML_TYPE_E4M3_Q; break;
case LLAMA_FTYPE_MOSTLY_E3M4_Q: default_type = GGML_TYPE_E3M4_Q; break;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}

View file

@ -88,10 +88,16 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr
const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
vdot->from_float(test_data2, tmp_q2.data(), test_size);
if (qfns_cpu->vec_dot_type != GGML_TYPE_F32) {
vdot->from_float(test_data2, tmp_q2.data(), test_size);
}
float result = INFINITY;
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
if (qfns_cpu->vec_dot_type != GGML_TYPE_F32) {
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
} else {
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, test_data2, 0, 1);
}
const float dot_ref = dot_product(test_data1, test_data2, test_size);

View file

@ -325,7 +325,7 @@ int main(int argc, char * argv[]) {
printf("\n");
}
if (params.op_quantize_row_q_dot) {
if (params.op_quantize_row_q_dot && ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type)->from_float) {
printf(" quantize_row_q_dot\n");
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));