Merge branch 'master' into gg/flash-attn

This commit is contained in:
Georgi Gerganov 2024-02-19 12:58:18 +02:00
commit 31109ca00a
No known key found for this signature in database
GPG key ID: BF970631944C16B7
87 changed files with 5115 additions and 1531 deletions

276
ggml.c
View file

@ -23,6 +23,9 @@
#include <limits.h>
#include <stdarg.h>
#include <signal.h>
#if defined(__gnu_linux__)
#include <syscall.h>
#endif
#ifdef GGML_USE_METAL
#include <unistd.h>
@ -673,6 +676,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ1_S] = {
.type_name = "iq1_s",
.blck_size = QK_K,
.type_size = sizeof(block_iq1_s),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_s,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q8_K] = {
.type_name = "q8_K",
.blck_size = QK_K,
@ -868,7 +883,7 @@ do { \
const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
_mm256_extractf128_ps(x[0], 1)); \
const __m128 t1 = _mm_hadd_ps(t0, t0); \
res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
} while (0)
// TODO: is this optimal ?
@ -1149,7 +1164,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
x[i] = _mm_add_ps(x[i], x[offset+i]); \
} \
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
}
// TODO: is this optimal ?
@ -2016,9 +2031,16 @@ struct ggml_numa_node {
};
// Process-wide NUMA topology and thread-placement state (held in g_state.numa).
struct ggml_numa_nodes {
// placement policy; assigned from the numa_flag argument of ggml_numa_init()
enum ggml_numa_strategy numa_strategy;
// per-node records, indexed by node number
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
// number of valid entries in nodes; ggml_numa_init() resets it to 0 when detection fails
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
uint32_t current_node; // node on which main process is executing
#if defined(__gnu_linux__)
// affinity mask captured via ggml_get_numa_affinity() during ggml_numa_init()
cpu_set_t cpuset; // cpuset from numactl
#else
uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
#endif
};
//
@ -2052,18 +2074,40 @@ inline static void ggml_critical_section_end(void) {
atomic_fetch_sub(&g_state_barrier, 1);
}
void ggml_numa_init(void) {
#if defined(__gnu_linux__)
// Capture the CPU affinity mask of the calling thread.
//
// Returns the mask reported by pthread_getaffinity_np(). If the query fails,
// a warning is printed and an empty set is returned, so the caller never sees
// a partially written mask.
static cpu_set_t ggml_get_numa_affinity(void) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);

    // pthread_getaffinity_np() returns an error number directly (it does not set errno)
    const int rv = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    if (rv != 0) {
        fprintf(stderr, "warning: pthread_getaffinity_np() failed: %s\n", strerror(rv));
        CPU_ZERO(&cpuset);
    }

    return cpuset;
}
#else
// Non-Linux placeholder: there is no affinity mask to capture,
// so a fixed value of 0 stands in for the cpuset.
static uint32_t ggml_get_numa_affinity(void) {
    const uint32_t no_numa_cpuset = 0; // no NUMA support
    return no_numa_cpuset;
}
#endif
void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
if (g_state.numa.n_nodes > 0) {
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
return;
}
#ifdef __linux__
#if defined(__gnu_linux__)
struct stat st;
char path[256];
int rv;
// set numa scheme
g_state.numa.numa_strategy = numa_flag;
GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
g_state.numa.cpuset = ggml_get_numa_affinity();
// enumerate nodes
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@ -2082,11 +2126,23 @@ void ggml_numa_init(void) {
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
// figure out which node we're on
uint current_cpu;
int getcpu_ret = 0;
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
#else
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
#endif
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
g_state.numa.n_nodes = 0;
return;
}
GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
struct ggml_numa_node * node = &g_state.numa.nodes[n];
GGML_PRINT_DEBUG("CPUs on node %u:", n);
@ -2113,6 +2169,7 @@ void ggml_numa_init(void) {
}
}
#else
GGML_UNUSED(numa_flag);
// TODO
#endif
}
@ -2293,6 +2350,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
@ -3246,7 +3304,7 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
}
// Set the tensor's name from a C string and return the tensor.
//
// The name is truncated to fit tensor->name; the stored string is always
// NUL-terminated, and strncpy zero-pads any unused tail of the buffer.
struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
    // copy at most size-1 bytes so the final byte is reserved for the terminator
    strncpy(tensor->name, name, sizeof(tensor->name) - 1);
    tensor->name[sizeof(tensor->name) - 1] = '\0';
    return tensor;
}
@ -5124,15 +5182,27 @@ static struct ggml_tensor * ggml_soft_max_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
struct ggml_tensor * pos,
float scale,
float max_bias,
bool inplace) {
GGML_ASSERT(ggml_is_contiguous(a));
if (mask) {
GGML_ASSERT(mask->type == GGML_TYPE_F16);
GGML_ASSERT(ggml_is_contiguous(mask));
GGML_ASSERT(mask->ne[2] == 1);
GGML_ASSERT(mask->ne[3] == 1);
GGML_ASSERT(mask->ne[1] >= a->ne[1]);
GGML_ASSERT(ggml_is_matrix(mask));
GGML_ASSERT(ggml_can_repeat_rows(mask, a));
}
if (pos) {
GGML_ASSERT(ggml_is_vector(pos));
GGML_ASSERT(pos->type == GGML_TYPE_F16);
GGML_ASSERT(pos->ne[0] == a->ne[0]);
}
if (max_bias > 0.0f) {
GGML_ASSERT(pos);
}
bool is_node = false;
@ -5143,13 +5213,14 @@ static struct ggml_tensor * ggml_soft_max_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
float params[] = { scale };
float params[] = { scale, max_bias };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_SOFT_MAX;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = mask;
result->src[2] = pos;
return result;
}
@ -5157,21 +5228,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
// Build a GGML_OP_SOFT_MAX node over a, with no mask, no positions,
// scale 1.0 and max_bias 0.0. The result is a new tensor (not in-place).
struct ggml_tensor * ggml_soft_max(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    // argument order: ctx, a, mask, pos, scale, max_bias, inplace
    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
}
// In-place variant of ggml_soft_max: same defaults (no mask, no positions,
// scale 1.0, max_bias 0.0), but the result is a view of a instead of a copy.
struct ggml_tensor * ggml_soft_max_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    // argument order: ctx, a, mask, pos, scale, max_bias, inplace
    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
}
// Extended soft_max builder (not in-place).
//
// mask     - optional (may be NULL); ggml_soft_max_impl asserts it is a contiguous F16 matrix
// pos      - optional (may be NULL); ggml_soft_max_impl asserts it is an F16 vector with
//            pos->ne[0] == a->ne[0]
// scale    - stored as op param 0
// max_bias - stored as op param 1; a value > 0.0f requires pos to be non-NULL
struct ggml_tensor * ggml_soft_max_ext(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * mask,
        struct ggml_tensor  * pos,
        float                 scale,
        float                 max_bias) {
    return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
}
// ggml_soft_max_back
@ -7744,6 +7817,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
{
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
} break;
@ -8011,6 +8085,7 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
{
ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
} break;
@ -8131,6 +8206,7 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
default:
{
GGML_ASSERT(false);
@ -10897,6 +10973,7 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
{
ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
} break;
@ -11077,6 +11154,7 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
default:
{
GGML_ASSERT(false);
@ -11274,6 +11352,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
{
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
} break;
@ -11577,6 +11656,7 @@ static void ggml_compute_forward_soft_max_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * src2,
struct ggml_tensor * dst) {
assert(ggml_is_contiguous(dst));
assert(ggml_are_same_shape(src0, dst));
@ -11585,16 +11665,29 @@ static void ggml_compute_forward_soft_max_f32(
return;
}
float scale = 1.0f;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// TODO: handle transposed/permuted matrices
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const int64_t ne11 = src1 ? src1->ne[1] : 1;
// TODO: is this supposed to be ceil instead of floor?
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
const uint32_t n_head_kv = ne02;
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
@ -11607,6 +11700,9 @@ static void ggml_compute_forward_soft_max_f32(
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
// when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
float * pos = src2 ? (float *) src2->data : src0->data;
for (int i1 = ir0; i1 < ir1; i1++) {
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
@ -11622,6 +11718,16 @@ static void ggml_compute_forward_soft_max_f32(
}
}
// ALiBi bias
if (max_bias > 0.0f) {
const uint32_t h = (i1/ne01)%ne02; // head
const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
for (int i = 0; i < nc; i++) {
wp[i] = wp[i] + slope*pos[i];
}
}
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
//printf("p[%d] = %f\n", i, p[i]);
@ -11666,11 +11772,12 @@ static void ggml_compute_forward_soft_max(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * src2,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
ggml_compute_forward_soft_max_f32(params, src0, src1, src2, dst);
} break;
default:
{
@ -11814,22 +11921,20 @@ static void ggml_compute_forward_alibi_f32(
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int64_t i = 0; i < ne0; i++) {
for (int64_t j = 0; j < ne1; j++) {
for (int64_t k = 0; k < ne2_ne3; k++) {
for (int64_t k = 0; k < ne2_ne3; k++) {
// TODO: k*nb2 or k*nb3
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
for (int64_t i = 0; i < ne0; i++) {
for (int64_t j = 0; j < ne1; j++) {
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
// TODO: k*nb2 or k*nb3
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
pdst[0] = i * m_k + src[0];
}
}
@ -11874,21 +11979,20 @@ static void ggml_compute_forward_alibi_f16(
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
for (int k = 0; k < ne2_ne3; k++) {
for (int k = 0; k < ne2_ne3; k++) {
// TODO: k*nb2 or k*nb3
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
// TODO: k*nb2 or k*nb3
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
// we return F32
pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
@ -11924,6 +12028,7 @@ static void ggml_compute_forward_alibi(
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
@ -12001,6 +12106,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
@ -15391,7 +15497,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break;
case GGML_OP_SOFT_MAX:
{
ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
} break;
case GGML_OP_SOFT_MAX_BACK:
{
@ -16953,27 +17059,47 @@ typedef pthread_t ggml_thread_t;
#endif
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__linux__) && !defined(__BIONIC__)
static void set_numa_thread_affinity(int thread_n, int n_threads) {
#if defined(__gnu_linux__)
static void set_numa_thread_affinity(int thread_n) {
if (!ggml_is_numa()) {
return;
}
// run thread on node_num thread_n / (threads per node)
const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
int node_num;
int rv;
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
switch(g_state.numa.numa_strategy) {
case GGML_NUMA_STRATEGY_DISTRIBUTE:
// run thread on node_num thread_n % n_nodes (threads are spread round-robin across nodes)
node_num = thread_n % g_state.numa.n_nodes;
break;
case GGML_NUMA_STRATEGY_ISOLATE:
// run thread on current_node
node_num = g_state.numa.current_node;
break;
case GGML_NUMA_STRATEGY_NUMACTL:
// use the cpuset that numactl gave us
rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
}
return;
default:
return;
}
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (size_t i = 0; i < node->n_cpus; ++i) {
CPU_SET_S(node->cpus[i], setsize, cpus);
}
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
}
CPU_FREE(cpus);
@ -16994,8 +17120,7 @@ static void clear_numa_thread_affinity(void) {
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
}
CPU_FREE(cpus);
@ -17003,7 +17128,7 @@ static void clear_numa_thread_affinity(void) {
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
// Stub for platforms without thread-affinity support: does nothing.
// Takes the same single thread_n argument as the Linux implementation.
static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
// Stub for platforms without thread-affinity support: does nothing.
static void clear_numa_thread_affinity(void) {
    // intentionally empty
}
#endif
@ -17304,7 +17429,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
const int n_threads = state->shared->n_threads;
set_numa_thread_affinity(state->ith, n_threads);
set_numa_thread_affinity(state->ith);
int node_n = -1;
int task_phase = GGML_TASK_FINALIZE;
@ -18116,7 +18241,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
ptr += ggml_nbytes(tensor);
fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
}
}
@ -18219,7 +18344,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
result->nodes[i] = tensor;
fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
}
}
}
@ -18844,7 +18969,9 @@ static enum ggml_opt_result linesearch_backtracking(
(*step) *= width;
}
GGML_UNREACHABLE();
GGML_ASSERT(false && "line search failed");
return GGML_LINESEARCH_FAIL;
}
static enum ggml_opt_result ggml_opt_lbfgs(
@ -19112,7 +19239,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
step[0] = 1.0;
}
GGML_UNREACHABLE();
GGML_ASSERT(false && "lbfgs failed");
return GGML_OPT_DID_NOT_CONVERGE;
}
struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
@ -19360,8 +19489,9 @@ void ggml_quantize_init(enum ggml_type type) {
ggml_critical_section_start();
switch (type) {
case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
default: // nothing
break;
@ -19373,8 +19503,10 @@ void ggml_quantize_init(enum ggml_type type) {
// Tear down quantization helper state inside the global critical section.
//
// Calls the free counterpart of each init routine used by ggml_quantize_init():
// iq2xs_free_impl() keyed by type for IQ2_XXS, IQ2_XS and IQ1_S, and
// iq3xs_free_impl(256) for IQ3_XXS.
// NOTE(review): presumably safe to call for types that were never initialized - confirm
// against iq2xs_free_impl()/iq3xs_free_impl().
void ggml_quantize_free(void) {
    ggml_critical_section_start();

    iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
    iq2xs_free_impl(GGML_TYPE_IQ2_XS);
    iq2xs_free_impl(GGML_TYPE_IQ1_S);
    iq3xs_free_impl(256);

    ggml_critical_section_end();
}
@ -19509,7 +19641,8 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
// Report whether quantizing to the given type requires an importance matrix.
// Returns true for IQ2_XXS, IQ2_XS and IQ1_S; false for every other type.
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
    return
        type == GGML_TYPE_IQ2_XXS ||
        type == GGML_TYPE_IQ2_XS  ||
        type == GGML_TYPE_IQ1_S;
}
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
@ -19634,6 +19767,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_IQ1_S:
{
GGML_ASSERT(start % QK_K == 0);
GGML_ASSERT(start % n_per_row == 0);
size_t start_row = start / n_per_row;
size_t row_size = ggml_row_size(type, n_per_row);
result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_F16:
{
size_t elemsize = sizeof(ggml_fp16_t);