ggml : add IQ2 to test-backend-ops + refactoring (#4990)
* ggml : add IQ2 to test-backend-ops + refactoring
  ggml-ci
* cuda : update supports_op for IQ2
  ggml-ci
* ci : enable LLAMA_CUBLAS=1 for CUDA nodes
  ggml-ci
* cuda : fix out-of-bounds-access in `mul_mat_vec_q`
  ggml-ci
* tests : avoid creating RNGs for each Q tensor
  ggml-ci
* tests : avoid creating RNGs for each tensor
  ggml-ci
This commit is contained in:
parent
ba69bbc84c
commit
38566680cd
9 changed files with 128 additions and 87 deletions
20
ggml.h
20
ggml.h
|
@@ -2065,6 +2065,18 @@ extern "C" {
|
|||
// quantization
|
||||
//
|
||||
|
||||
// - ggml_quantize_init can be called multiple times with the same type
|
||||
// it will only initialize the quantization tables for the first call or after ggml_quantize_free
|
||||
// automatically called by ggml_quantize_chunk for convenience
|
||||
//
|
||||
// - ggml_quantize_free will free any memory allocated by ggml_quantize_init
|
||||
// call this at the end of the program to avoid memory leaks
|
||||
//
|
||||
// note: these are thread-safe
|
||||
//
|
||||
GGML_API void ggml_quantize_init(enum ggml_type type);
|
||||
GGML_API void ggml_quantize_free(void);
|
||||
|
||||
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
||||
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
@@ -2078,13 +2090,13 @@ extern "C" {
|
|||
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
// some quantization types cannot be used without an importance matrix
|
||||
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
||||
|
||||
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
||||
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
|
||||
// These are needed for IQ2_XS and IQ2_XXS quantizations
|
||||
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
|
||||
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
|
||||
|
||||
//
|
||||
// gguf
|
||||
//
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue