llama : add Mixtral support (#4406)
* convert : support Mixtral as LLAMA arch
* convert : fix n_ff typo
* llama : model loading
* ggml : sync latest ggml_mul_mat_id
* llama : update graph to support MoE
* llama : fix cur -> cur_expert
* llama : first working version
* llama : fix expert weighting in the FFN
* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)
* ggml : add n_as argument to ggml_mul_mat_id
* ggml : fix ggml_get_rows to take into account ne02 / ne11
* metal : add more general support for ggml_get_rows + tests
* llama : add basic support for offloading moe with CUDA
* metal : add/mul/div use general kernel when src1 not cont
* metal : reduce the kernel launches for ggml_mul_mat_id
* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D
* ggml : update get_rows f16 and q
* cuda : support non-contiguous src1 in get_rows
* llama : offload missing ffn_moe_silu
* metal : fix ggml_get_rows to work with non-cont src1
* metal : add indirect mat-vec kernels for all quantization types
* llama : do not quantize expert gating tensors
* llama : add n_expert and n_expert_used to hparams + change quants
* test-backend-ops : add moe test
* cuda : fix get_rows when ncols is odd
* convert : determine n_ctx correctly
* metal : fix ggml_mul_mat_id for F32
* test-backend-ops : make experts more evenly probable (test_moe)
* test-backend-ops : cleanup, add moe test for batches
* test-backend-ops : add cpy from f32 -> all types test
* test-backend-ops : fix dequantize block offset
* llama : fix hard-coded number of experts
* test-backend-ops : simplify and disable slow tests to avoid CI timeout
* test-backend-ops : disable MOE test with thread sanitizer
* cuda : fix mul_mat_id with multi gpu
* convert : use 1e6 rope_freq_base for mixtral
* convert : fix style
* convert : support safetensors format
* gguf-py : bump version
* metal : add cpy f16 -> f32 kernel
* metal : fix binary ops for ne10 % 4 != 0
* test-backend-ops : add one more sum_rows test
* ggml : do not use BLAS with ggml_mul_mat_id
* convert-hf : support for mixtral-instruct (#4428)
* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct
* convert : use sentencepiece tokenizer for Mixtral-instruct
* convert : make flake8 happy
* metal : fix soft_max kernels
ref: 1914017863
* metal : limit kernels to not use more than the allowed threads
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
This commit is contained in:
parent
fecac45658
commit
799a1cb13b
14 changed files with 2370 additions and 395 deletions
168
ggml.c
168
ggml.c
|
@ -4075,17 +4075,18 @@ struct ggml_tensor * ggml_mul_mat(
|
|||
|
||||
struct ggml_tensor * ggml_mul_mat_id(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * as[],
|
||||
struct ggml_tensor * const as[],
|
||||
int n_as,
|
||||
struct ggml_tensor * ids,
|
||||
int id,
|
||||
struct ggml_tensor * b) {
|
||||
|
||||
int64_t n_as = ids->ne[0];
|
||||
|
||||
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_is_vector(ids));
|
||||
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
|
||||
GGML_ASSERT(ids->ne[1] == b->ne[1]);
|
||||
GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
|
||||
GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
|
||||
GGML_ASSERT(id >= 0 && id < n_as);
|
||||
GGML_ASSERT(id >= 0 && id < ids->ne[0]);
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
|
@ -4097,13 +4098,14 @@ struct ggml_tensor * ggml_mul_mat_id(
|
|||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
|
||||
|
||||
ggml_set_op_params_i32(result, 0, id);
|
||||
ggml_set_op_params_i32(result, 1, n_as);
|
||||
|
||||
result->op = GGML_OP_MUL_MAT_ID;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = ids;
|
||||
result->src[1] = b;
|
||||
|
||||
for (int64_t i = 0; i < n_as; i++) {
|
||||
for (int i = 0; i < n_as; i++) {
|
||||
struct ggml_tensor * a = as[i];
|
||||
GGML_ASSERT(ggml_are_same_shape(as[0], a));
|
||||
GGML_ASSERT(ggml_can_mul_mat(a, b));
|
||||
|
@ -4731,7 +4733,9 @@ struct ggml_tensor * ggml_get_rows(
|
|||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b) {
|
||||
GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(a->ne[2] == b->ne[1]);
|
||||
GGML_ASSERT(b->ne[3] == 1);
|
||||
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
|
@ -4741,7 +4745,7 @@ struct ggml_tensor * ggml_get_rows(
|
|||
|
||||
// TODO: implement non F32 return
|
||||
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
|
||||
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]);
|
||||
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
|
||||
|
||||
result->op = GGML_OP_GET_ROWS;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
|
@ -9504,8 +9508,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|||
const int64_t ne0 = dst->ne[0];
|
||||
const int64_t ne1 = dst->ne[1];
|
||||
|
||||
// NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
|
||||
// all the experts for each batch element and the processing would become incredibly slow
|
||||
// TODO: find the optimal values for these
|
||||
if (ggml_is_contiguous(src0) &&
|
||||
if (dst->op != GGML_OP_MUL_MAT_ID &&
|
||||
ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) &&
|
||||
//src0->type == GGML_TYPE_F32 &&
|
||||
src1->type == GGML_TYPE_F32 &&
|
||||
|
@ -9519,11 +9526,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|||
}
|
||||
#endif
|
||||
|
||||
// off1 = offset in i11 and i1
|
||||
// cne1 = ne11 and ne1
|
||||
// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
|
||||
// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
|
||||
static void ggml_compute_forward_mul_mat(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
struct ggml_tensor * dst,
|
||||
int64_t off1, int64_t cne1) {
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
|
@ -9591,10 +9603,9 @@ static void ggml_compute_forward_mul_mat(
|
|||
const int64_t i03 = i13/r3;
|
||||
const int64_t i02 = i12/r2;
|
||||
|
||||
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
||||
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
||||
const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
|
||||
float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
|
||||
|
||||
if (type != GGML_TYPE_F32) {
|
||||
float * const wdata = params->wdata;
|
||||
|
@ -9611,10 +9622,10 @@ static void ggml_compute_forward_mul_mat(
|
|||
}
|
||||
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
cne1, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -9630,6 +9641,7 @@ static void ggml_compute_forward_mul_mat(
|
|||
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
|
||||
|
||||
assert(params->wsize >= ne11*ne12*ne13*row_size);
|
||||
assert(src1->type == GGML_TYPE_F32);
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
|
@ -9652,7 +9664,7 @@ static void ggml_compute_forward_mul_mat(
|
|||
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
|
||||
|
||||
const int64_t nr0 = ne01; // src0 rows
|
||||
const int64_t nr1 = ne11*ne12*ne13; // src1 rows
|
||||
const int64_t nr1 = cne1*ne12*ne13; // src1 rows
|
||||
|
||||
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
|
||||
|
||||
|
@ -9694,9 +9706,9 @@ static void ggml_compute_forward_mul_mat(
|
|||
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
||||
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
||||
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
|
||||
const int64_t i13 = (ir1/(ne12*ne11));
|
||||
const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
|
||||
const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
|
||||
const int64_t i13 = (ir1/(ne12*cne1));
|
||||
const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
|
||||
const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
|
||||
|
||||
// broadcast src0 into src1
|
||||
const int64_t i03 = i13/r3;
|
||||
|
@ -9736,20 +9748,28 @@ static void ggml_compute_forward_mul_mat(
|
|||
|
||||
static void ggml_compute_forward_mul_mat_id(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * ids = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
// during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
|
||||
ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
|
||||
return;
|
||||
}
|
||||
|
||||
const int id = ggml_get_op_params_i32(dst, 0);
|
||||
const struct ggml_tensor * ids = src0;
|
||||
const int id = ggml_get_op_params_i32(dst, 0);
|
||||
const int n_as = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
const int a_id = ((int32_t *)ids->data)[id];
|
||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
|
||||
|
||||
GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
|
||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[a_id + 2];
|
||||
|
||||
ggml_compute_forward_mul_mat(params, src0, src1, dst);
|
||||
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
||||
ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_out_prod
|
||||
|
@ -10325,21 +10345,30 @@ static void ggml_compute_forward_get_rows_q(
|
|||
return;
|
||||
}
|
||||
|
||||
const int nc = src0->ne[0];
|
||||
const int nr = ggml_nelements(src1);
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int64_t nc = ne00;
|
||||
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
|
||||
|
||||
assert( dst->ne[0] == nc);
|
||||
assert( dst->ne[1] == nr);
|
||||
assert(src0->nb[0] == ggml_type_size(type));
|
||||
assert(ne0 == nc);
|
||||
assert(ne02 == ne11);
|
||||
assert(nb00 == ggml_type_size(type));
|
||||
assert(ggml_nrows(dst) == nr);
|
||||
|
||||
for (int i = 0; i < nr; ++i) {
|
||||
const int r = ((int32_t *) src1->data)[i];
|
||||
// TODO: multi-thread
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
||||
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
||||
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
||||
|
||||
dequantize_row_q(
|
||||
(const void *) ((char *) src0->data + r*src0->nb[1]),
|
||||
(float *) ((char *) dst->data + i*dst->nb[1]), nc);
|
||||
dequantize_row_q(
|
||||
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -10354,19 +10383,26 @@ static void ggml_compute_forward_get_rows_f16(
|
|||
return;
|
||||
}
|
||||
|
||||
const int nc = src0->ne[0];
|
||||
const int nr = ggml_nelements(src1);
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
assert( dst->ne[0] == nc);
|
||||
assert( dst->ne[1] == nr);
|
||||
assert(src0->nb[0] == sizeof(ggml_fp16_t));
|
||||
const int64_t nc = ne00;
|
||||
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
|
||||
|
||||
for (int i = 0; i < nr; ++i) {
|
||||
const int r = ((int32_t *) src1->data)[i];
|
||||
assert(ne0 == nc);
|
||||
assert(ne02 == ne11);
|
||||
assert(nb00 == sizeof(ggml_fp16_t));
|
||||
assert(ggml_nrows(dst) == nr);
|
||||
|
||||
for (int j = 0; j < nc; ++j) {
|
||||
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
|
||||
((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
|
||||
// TODO: multi-thread
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
||||
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
||||
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
||||
|
||||
ggml_fp16_to_fp32_row(
|
||||
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -10382,19 +10418,27 @@ static void ggml_compute_forward_get_rows_f32(
|
|||
return;
|
||||
}
|
||||
|
||||
const int nc = src0->ne[0];
|
||||
const int nr = ggml_nelements(src1);
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
assert( dst->ne[0] == nc);
|
||||
assert( dst->ne[1] == nr);
|
||||
assert(src0->nb[0] == sizeof(float));
|
||||
const int64_t nc = ne00;
|
||||
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
|
||||
|
||||
for (int i = 0; i < nr; ++i) {
|
||||
const int r = ((int32_t *) src1->data)[i];
|
||||
assert(ne0 == nc);
|
||||
assert(ne02 == ne11);
|
||||
assert(nb00 == sizeof(float));
|
||||
assert(ggml_nrows(dst) == nr);
|
||||
|
||||
ggml_vec_cpy_f32(nc,
|
||||
(float *) ((char *) dst->data + i*dst->nb[1]),
|
||||
(float *) ((char *) src0->data + r*src0->nb[1]));
|
||||
// TODO: multi-thread
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
||||
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
||||
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
||||
|
||||
ggml_vec_cpy_f32(nc,
|
||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
||||
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -14037,11 +14081,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
|
||||
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
ggml_compute_forward_mul_mat_id(params, tensor);
|
||||
ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_OUT_PROD:
|
||||
{
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue