Merge branch 'ggerganov:master' into master

commit c7f7f13650

5 changed files with 98 additions and 43 deletions
README.md

@@ -93,6 +93,7 @@ as the main playground for developing new features for the [ggml](https://github
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
+- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)

 **UI:**

@@ -687,6 +688,8 @@ GGML_OPENCL_DEVICE=0
 export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
 ```

+(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
+
 For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

 Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
examples/train-text-from-scratch/train-text-from-scratch.cpp

@@ -294,20 +294,9 @@ void init_model(struct my_llama_model * model) {
         ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());

-        // 'layers.10.feed_forward.w1.weight' has length of 32.
-        // ggml_tensor->name only has 32 characters, but we need one more for the '\0' terminator.
-        // ggml_set_name will set the last character to '\0', so we can only store 'layers.10.feed_forward.w1.weigh'.
-        // when saving llama compatible model the tensors names will miss a character.
-        // ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str());
-        // ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str());
-        // ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str());
-
-        strncpy(layer.w1->name, (layers_i + ".feed_forward.w1.weight").c_str(), sizeof(layer.w1->name));
-        strncpy(layer.w2->name, (layers_i + ".feed_forward.w2.weight").c_str(), sizeof(layer.w2->name));
-        strncpy(layer.w3->name, (layers_i + ".feed_forward.w3.weight").c_str(), sizeof(layer.w3->name));
-        layer.w1->padding[0] = 0;
-        layer.w2->padding[0] = 0;
-        layer.w3->padding[0] = 0;
+        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
+        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
+        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
     }
 }

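The removed comments explain the motivation: 'layers.10.feed_forward.w1.weight' is exactly 32 characters, so with GGML_MAX_NAME at 32 a terminated copy could keep only 31 of them, and the strncpy workaround relied on zeroing the adjacent padding field to act as the terminator. This commit raises GGML_MAX_NAME to 48 (see the ggml.h hunk below) and switches to ggml_format_name. A minimal standalone sketch of the difference, assuming only the old and new buffer sizes:

```c
// Sketch (not from the commit) of why the old 32-byte name buffer truncated.
#include <stdio.h>
#include <string.h>

#define OLD_MAX_NAME 32   // old GGML_MAX_NAME
#define NEW_MAX_NAME 48   // new GGML_MAX_NAME after this commit

int main(void) {
    const char * name = "layers.10.feed_forward.w1.weight"; // 32 characters
    printf("strlen = %zu\n", strlen(name));                 // 32: needs 33 bytes with '\0'

    char old_buf[OLD_MAX_NAME];
    strncpy(old_buf, name, OLD_MAX_NAME);
    old_buf[OLD_MAX_NAME - 1] = '\0';   // forced terminator, as ggml_set_name effectively did
    printf("old: %s\n", old_buf);       // "layers.10.feed_forward.w1.weigh" - last char lost

    char new_buf[NEW_MAX_NAME];
    // snprintf always terminates and the wider buffer has room, mirroring ggml_format_name:
    snprintf(new_buf, sizeof(new_buf), "%s.feed_forward.w1.weight", "layers.10");
    printf("new: %s\n", new_buf);       // full 32-character name fits
    return 0;
}
```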
@@ -454,8 +443,8 @@ struct ggml_tensor * forward(
         // wk shape [n_embd, n_embd, 1, 1]
         // Qcur shape [n_embd/n_head, n_head, N, 1]
         // Kcur shape [n_embd/n_head, n_head, N, 1]
-        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);

         // store key and value to memory
         {
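This and the following call-site hunks are mechanical updates: ggml_rope_inplace gains a trailing n_ctx argument, and passing 0 preserves the old behavior because n_ctx is only read on the ChatGLM path (mode & 4). A hedged sketch of the migration pattern; the helper name is hypothetical and the tensor setup is assumed from the surrounding function:

```c
#include "ggml.h"

// Hypothetical helper showing the before/after of one call site; `qcur`
// stands for the reshaped query tensor as in forward() above.
static struct ggml_tensor * rope_q(
        struct ggml_context * ctx0,
        struct ggml_tensor  * qcur,
        int n_past,
        int n_rot) {
    // before: ggml_rope_inplace(ctx0, qcur, n_past, n_rot, 0);
    // after:  a 6th argument, n_ctx; 0 is safe since mode 0 never reads it
    return ggml_rope_inplace(ctx0, qcur, n_past, n_rot, /*mode=*/0, /*n_ctx=*/0);
}
```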
@@ -711,8 +700,8 @@ struct ggml_tensor * forward_batch(
         // wk shape [n_embd, n_embd, 1, 1]
         // Qcur shape [n_embd/n_head, n_head, N, n_batch]
         // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
         assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
         assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@@ -996,8 +985,8 @@ struct ggml_tensor * forward_batch_wo_cache(
         // wk shape [n_embd, n_embd, 1, 1]
         // Qcur shape [n_embd/n_head, n_head, N, n_batch]
         // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
         assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
         assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@@ -1218,8 +1207,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
         // compute Q and K and RoPE them
         // wq shape [n_embd, n_embd, 1, 1]
         // wk shape [n_embd, n_embd, 1, 1]
-        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
         assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
         assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@@ -1618,10 +1607,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
     use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch);
     use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch);
     use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-    use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
+    use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
     use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch);
     use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-    use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
+    use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
     use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd);
     use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
     use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
@@ -2368,7 +2357,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
         file->write_u32(0);
         file->write_u32(0);
         file->write_u32(GGML_TYPE_F32);
-        file->seek(0-file->tell() & 31, SEEK_CUR);
+        file->seek((0-file->tell()) & 31, SEEK_CUR);
         return;
     }
     const char * name = ggml_get_name(tensor);

@@ -2383,7 +2372,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
     file->write_u32(tensor->type);
     file->write_raw(ne, sizeof(ne[0]) * nd);
     file->write_raw(name, name_len);
-    file->seek(0-file->tell() & 31, SEEK_CUR);
+    file->seek((0-file->tell()) & 31, SEEK_CUR);
     file->write_raw(tensor->data, ggml_nbytes(tensor));
 }

@@ -2404,7 +2393,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
     std::string name = file->read_string(name_len);
     GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);

-    file->seek(0-file->tell() & 31, SEEK_CUR);
+    file->seek((0-file->tell()) & 31, SEEK_CUR);
     file->read_raw(tensor->data, ggml_nbytes(tensor));
 }

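All three seek fixes touch the same idiom: `(0 - file->tell()) & 31` is the number of bytes needed to pad the current offset forward to the next 32-byte boundary. Since binary `-` binds tighter than `&` in C, the added parentheses do not change the result; they make the precedence explicit and quiet compiler warnings. A small self-contained check of the arithmetic:

```c
// Sketch of the 32-byte alignment trick used by write_tensor/read_tensor.
// For any offset t, (0 - t) & 31 is the padding needed to reach the next
// multiple of 32 (0 when t is already aligned).
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t pad_to_32(uint64_t t) {
    return (0 - t) & 31u;   // same value with or without the parentheses
}

int main(void) {
    assert(pad_to_32(0)  == 0);
    assert(pad_to_32(1)  == 31);
    assert(pad_to_32(32) == 0);
    assert(pad_to_32(45) == 19);  // 45 + 19 = 64
    printf("ok\n");
    return 0;
}
```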
ggml.c (82 changes)

@@ -6778,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
         int n_past,
         int n_dims,
         int mode,
+        int n_ctx,
         bool inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6790,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(

     ggml_scratch_save(ctx);

-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);

     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ((int32_t *) b->data)[3] = n_ctx;

     ggml_scratch_load(ctx);

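ggml passes an op's scalar parameters to the compute kernels by packing them into a small I32 tensor that rides along as src1; builder and kernel must agree on element count and order, which is why the asserts below change from 3 to 4 in the same commit. A plain-C sketch of that convention (the values are made up):

```c
// Sketch (plain C, no ggml) of the parameter-packing convention:
// the op builder writes int32 params into a buffer, and the kernel
// reads them back by index, so both sides must agree on the layout.
#include <stdint.h>
#include <stdio.h>

int main(void) {
    int32_t params[4];            // was 3 elements before this commit
    params[0] = 5;    /* n_past */
    params[1] = 128;  /* n_dims */
    params[2] = 4;    /* mode: ChatGLM bit */
    params[3] = 2048; /* n_ctx: newly added */

    // kernel side (ggml_compute_forward_rope_*) unpacks in the same order:
    const int n_past = params[0];
    const int n_dims = params[1];
    const int mode   = params[2];
    const int n_ctx  = params[3];
    printf("n_past=%d n_dims=%d mode=%d n_ctx=%d\n", n_past, n_dims, mode, n_ctx);
    return 0;
}
```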
@@ -6811,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
         struct ggml_tensor * a,
         int n_past,
         int n_dims,
-        int mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
+        int mode,
+        int n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
 }

 struct ggml_tensor * ggml_rope_inplace(

@@ -6820,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
         struct ggml_tensor * a,
         int n_past,
         int n_dims,
-        int mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
+        int mode,
+        int n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
 }

 // ggml_rope_back
@@ -12440,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 3);
+    GGML_ASSERT(ggml_nelements(src1) == 4);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -12449,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode = ((int32_t *) src1->data)[2];
+    const int n_ctx = ((int32_t *) src1->data)[3];

     assert(n_past >= 0);

@@ -12493,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
     const float theta_scale = powf(10000.0, -2.0f/n_dims);

     const bool is_neox = mode & 2;
+    const bool is_glm = mode & 4;

     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
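The mode argument is a bitfield; this commit claims flag value 4 for the ChatGLM variant alongside the existing flags (documented in the ggml.h hunk further down). A tiny decoding sketch:

```c
// Sketch: how the rope mode bitfield decomposes (per the ggml.h comments).
#include <stdbool.h>
#include <stdio.h>

int main(void) {
    const int mode = 4;               // e.g. ChatGLM-style rotation
    const bool skip_past = mode & 1;  // start at n_past, leaving earlier rows untouched
    const bool is_neox   = mode & 2;  // GPT-NeoX style
    const bool is_glm    = mode & 4;  // ChatGLM style (new in this commit)
    printf("skip_past=%d is_neox=%d is_glm=%d\n", skip_past, is_neox, is_glm);
    return 0;
}
```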
@@ -12503,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(

                 float theta = (float)p;

-                if (!is_neox) {
+                if (is_glm) {
+                    theta = MIN(p, n_ctx - 2);
+                    float block_theta = MAX(p - (n_ctx - 2), 0);
+                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);
+                        const float cos_block_theta = cosf(block_theta);
+                        const float sin_block_theta = sinf(block_theta);
+
+                        theta *= theta_scale;
+                        block_theta *= theta_scale;
+
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        const float x0 = src[0];
+                        const float x1 = src[n_dims/2];
+                        const float x2 = src[n_dims];
+                        const float x3 = src[n_dims/2*3];
+
+                        dst_data[0] = x0*cos_theta - x1*sin_theta;
+                        dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                        dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
+                        dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
+                    }
+                } else if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
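The new branch implements ChatGLM-style RoPE, which encodes two positions per token: theta rotates one pair in each group of four values using the position clamped to n_ctx - 2, while block_theta rotates the other pair and only starts growing once the position passes that clamp, mirroring ChatGLM's two-channel position ids. A standalone sketch of just that angle schedule:

```c
// Sketch (not ggml code) of the ChatGLM angle schedule from the branch above.
// For token position p and context size n_ctx:
//   theta       is the primary position, capped at n_ctx - 2;
//   block_theta is the secondary position, zero until p exceeds n_ctx - 2.
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
    const int n_ctx = 8; // tiny context, for illustration only
    for (int p = 0; p < 12; p++) {
        const float theta       = (float) MIN(p, n_ctx - 2);
        const float block_theta = (float) MAX(p - (n_ctx - 2), 0);
        printf("p=%2d  theta=%4.1f  block_theta=%4.1f\n", p, theta, block_theta);
    }
    return 0;
}
```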
@@ -12553,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 3);
+    GGML_ASSERT(ggml_nelements(src1) == 4);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -12562,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode = ((int32_t *) src1->data)[2];
+    const int n_ctx = ((int32_t *) src1->data)[3];

     assert(n_past >= 0);

@@ -12606,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
     const float theta_scale = powf(10000.0, -2.0f/n_dims);

     const bool is_neox = mode & 2;
+    const bool is_glm = mode & 4;

     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12616,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(

                 float theta = (float)p;

-                if (!is_neox) {
+                if (is_glm) {
+                    theta = MIN(p, n_ctx - 2);
+                    float block_theta = MAX(p - (n_ctx - 2), 0);
+                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);
+                        const float cos_block_theta = cosf(block_theta);
+                        const float sin_block_theta = sinf(block_theta);
+
+                        theta *= theta_scale;
+                        block_theta *= theta_scale;
+
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        const float x0 = GGML_FP16_TO_FP32(src[0]);
+                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+                        const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
+                        const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
+
+                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
+                        dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
+                    }
+                } if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
@@ -16189,17 +16247,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 if (src0->grad) {
                     assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 3);
+                    assert(ggml_nelements(src1) == 4);
                     const int n_past = ((int32_t *) src1->data)[0];
                     const int n_dims = ((int32_t *) src1->data)[1];
                     const int mode = ((int32_t *) src1->data)[2];
+                    const int n_ctx = ((int32_t *) src1->data)[3];
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
                             ggml_rope(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
-                                mode),
+                                mode,
+                                n_ctx),
                             inplace);
                 }
                 if (src1->grad) {

ggml.h (9 changes)

@@ -198,7 +198,7 @@
 #define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_OPT 4
-#define GGML_MAX_NAME 32
+#define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4

 #define GGML_ASSERT(x) \
@@ -1036,13 +1036,15 @@ extern "C" {
     // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
+    // if mode & 4 == 1, ChatGLM style
     // TODO: avoid creating a new tensor every time
     GGML_API struct ggml_tensor * ggml_rope(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past,
             int n_dims,
-            int mode);
+            int mode,
+            int n_ctx);

     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_inplace(

@@ -1050,7 +1052,8 @@ extern "C" {
             struct ggml_tensor * a,
             int n_past,
             int n_dims,
-            int mode);
+            int mode,
+            int n_ctx);

     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
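With the declarations extended, selecting the new variant is a matter of setting the mode 4 flag and supplying a real n_ctx; the other modes can keep passing 0. A hedged usage sketch (the wrapper name is mine, and the context/tensor setup is assumed):

```c
#include "ggml.h"

// Hypothetical wrapper; assumes an initialized ggml_context `ctx` and an
// activation tensor `x` shaped for RoPE as in the examples above.
struct ggml_tensor * rope_chatglm(
        struct ggml_context * ctx,
        struct ggml_tensor  * x,
        int n_past, int n_dims, int n_ctx) {
    // mode = 4 selects the ChatGLM-style rotation added in this commit;
    // n_ctx caps the primary position and seeds the "block" position.
    return ggml_rope_inplace(ctx, x, n_past, n_dims, /*mode=*/4, n_ctx);
}
```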
llama.cpp

@@ -1491,11 +1491,11 @@ static bool llama_eval_internal(
         offload_func_kq(tmpq);
         ggml_set_name(tmpq, "tmpq");

-        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
         offload_func_kq(Kcur);
         ggml_set_name(Kcur, "Kcur");

-        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
         offload_func_kq(Qcur);
         ggml_set_name(Qcur, "Qcur");