add mrope unit test, fix few compiler warnings

HimariO 2024-12-08 00:47:48 +08:00
parent 6c39aa38f5
commit ac2089c378
5 changed files with 74 additions and 32 deletions

examples/llava/clip.cpp

@@ -2488,7 +2488,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
-    const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 3 : num_positions;
     if(ctx->load_image_size==nullptr){
         ctx->load_image_size= clip_image_size_init();
     }
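
Note: num_position_ids was computed here but apparently never read again, which trips -Wunused-variable on GCC/Clang; deleting the declaration is the whole fix. A minimal repro of the warning, with hypothetical names:

    // warning: unused variable 'num_position_ids' [-Wunused-variable]
    void repro(bool has_qwen2vl_merger, int num_positions) {
        const int num_position_ids = has_qwen2vl_merger ? num_positions * 3 : num_positions;
        // nothing below reads num_position_ids, so the declaration can simply be removed
    }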

examples/llava/qwen2vl-cli.cpp

@@ -24,7 +24,9 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
     const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
     const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
     auto img_tokens = image_embed->n_image_pos;
-    llama_pos mrope_pos[img_tokens * 4];
+    // llama_pos mrope_pos[img_tokens * 4];
+    std::vector<llama_pos> mrope_pos;
+    mrope_pos.resize(img_tokens * 4);
     for (int y = 0; y < ph; y++)
     {
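
Note: the replaced declaration was a variable-length array, a C99 feature that ISO C++ lacks: MSVC rejects it outright and GCC/Clang accept it only as an extension (flagged by -Wvla). std::vector keeps the contiguous, runtime-sized buffer in portable C++. A minimal sketch of the pattern, assuming llama_pos is the int32_t typedef from llama.h:

    #include <cstdint>
    #include <vector>

    using llama_pos = std::int32_t;  // assumption: mirrors the typedef in llama.h

    void build_positions(int img_tokens) {
        // llama_pos mrope_pos[img_tokens * 4];           // VLA: ill-formed in ISO C++
        std::vector<llama_pos> mrope_pos(img_tokens * 4); // portable replacement
        // ... fill the four position sections exactly as the loop below does ...
        (void) mrope_pos;
    }
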
@@ -350,7 +352,7 @@ static void llava_free(struct llava_context * ctx_llava) {
 #ifndef NDEBUG
-static void tmp_test_rope(struct llava_context * ctx_llava, common_params * params) {
+static void tmp_test_rope() {
     int n_threads = 1;
     static size_t buf_size = 512u*1024*1024;
@@ -415,13 +417,13 @@ static void tmp_test_rope(struct llava_context * ctx_llava, common_params * para
     }
 }
 
-static void tmp_dump_img_embed(struct llava_context * ctx_llava, common_params * params) {
-    // auto * image_embed = load_image(ctx_llava, params, "/home/ron/Downloads/gguf/dog.jpeg");
+static void tmp_dump_img_embed(struct llava_context * ctx_llava) {
     int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
-    // int ne = n_embd * image_embed->n_image_pos;
     int ne = n_embd * 4;
     float vals[56 * 56 * 3];
-    float embd[ne];
+    // float embd[ne];
+    std::vector<float> embd;
+    embd.resize(ne);
     for (int i = 0; i < 56*56; i++)
     {
@@ -429,12 +431,11 @@ static void tmp_dump_img_embed(struct llava_context * ctx_llava, common_params *
             vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
         }
-    // auto param = &ctx_llava->ctx_clip->vision_model.hparams;
-    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd);
+    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
     std::ofstream outFile("img_embed.bin", std::ios::binary);
     if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char*>(embd), ne * sizeof(float));
+        outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
         outFile.close();
         std::cout << "Data successfully written to mrope.bin" << std::endl;
@@ -484,7 +485,7 @@ int main(int argc, char ** argv) {
     } else if (params.image[0].empty()) {
         auto ctx_llava = llava_init_context(&params, model);
-        tmp_dump_img_embed(ctx_llava, &params);
+        tmp_dump_img_embed(ctx_llava);
         llama_perf_context_print(ctx_llava->ctx_llama);
         ctx_llava->model = NULL;

ggml/src/ggml.c

@@ -3585,7 +3585,6 @@ struct ggml_tensor * ggml_mrope_ext(
     memcpy(params + 9, &beta_fast, sizeof(float));
     memcpy(params + 10, &beta_slow, sizeof(float));
     memcpy(&params[11], sections, sizeof(int)*4);
-    // memcpy(params + 11, sections, sizeof(int)*3);
     ggml_set_op_params(result, params, sizeof(params));
     result->op = GGML_OP_ROPE;
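
Note: ggml serializes RoPE hyperparameters into the result tensor's fixed op_params blob, and the dropped line was a stale comment from an earlier three-section layout; the live memcpy already stores all four mrope sections starting at slot 11 (slots 9 and 10 hold beta_fast and beta_slow, as the context lines show). A sketch of the round trip under that layout:

    #include <cstdint>
    #include <cstring>

    int main() {
        std::int32_t op_params[11 + 4] = {0};  // slots 0..10: the scalar rope parameters
        const int sections[4] = {16, 24, 24, 0};
        std::memcpy(&op_params[11], sections, sizeof(int) * 4);  // producer (ggml_mrope_ext)

        int out[4];
        std::memcpy(out, &op_params[11], sizeof(int) * 4);       // consumer (rope kernel)
        return (out[0] == 16 && out[3] == 0) ? 0 : 1;            // 0 on success
    }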

src/llama.cpp

@@ -3337,6 +3337,7 @@ struct llama_context {
     // whether we are computing encoder output or decoder output
     bool is_encoding = false;
 
+    // TODO: find a better way to accommodate mutli-dimension position encoding methods
     // number of position id each token get, 1 for each token in most cases.
     // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
     int n_pos_per_token = 1;
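
Note: with M-RoPE each token carries three coordinates (temporal, height, width), so a batch's position buffer holds n_pos_per_token entries per token, one section per dimension. A sketch of that sectioned layout, matching the i + n_tokens * j indexing the unit test below uses; the coordinate sources here are placeholders:

    #include <cstdint>
    #include <vector>

    using llama_pos = std::int32_t;  // assumption: mirrors the typedef in llama.h

    std::vector<llama_pos> make_mrope_pos(int n_tokens) {
        const int n_pos_per_token = 3;
        std::vector<llama_pos> pos(n_tokens * n_pos_per_token);
        for (int i = 0; i < n_tokens; ++i) {
            pos[i + n_tokens * 0] = i;  // temporal position (placeholder)
            pos[i + n_tokens * 1] = 0;  // height coordinate (placeholder)
            pos[i + n_tokens * 2] = 0;  // width coordinate (placeholder)
        }
        return pos;
    }
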
@@ -5719,6 +5720,7 @@ static void llm_load_hparams(
                 std::fill(hparams.rope_mrope_sections.begin(), hparams.rope_mrope_sections.end(), 0);
                 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_mrope_sections, 4, true);
             }
+            // fall through
         case LLM_ARCH_QWEN2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
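
Note: the added comment marks the case fall-through as intentional: LLM_ARCH_QWEN2VL first reads the rope-sections key that only it defines, then falls into LLM_ARCH_QWEN2 to reuse the shared hparams loading. GCC's -Wimplicit-fallthrough recognizes such comments; C++17's [[fallthrough]] attribute is the portable spelling. The pattern, reduced to a sketch with hypothetical names:

    enum Arch { ARCH_QWEN2VL, ARCH_QWEN2 };

    void load_hparams(Arch arch) {
        switch (arch) {
            case ARCH_QWEN2VL:
                /* read the mrope sections key only the VL variant has */
                // fall through
            case ARCH_QWEN2:
                /* read the hparams both variants share */
                break;
        }
    }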

tests/test-rope.cpp

@@ -138,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
     struct ggml_tensor * x;
     // rope f32
-    for (int m = 0; m < 3; ++m) {
+    for (int m = 0; m < 5; ++m) {
         const int ndims = 4;
         const int64_t n_rot = 128;
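
Note: the loop bound grows from 3 to 5: iterations m == 0, 1, 2 keep exercising modes 0, 2, and 4, while the two new iterations (m == 3 and m == 4) cover GGML_ROPE_TYPE_MROPE and GGML_ROPE_TYPE_VISION in the hunk below.
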
@@ -147,28 +147,69 @@ int main(int /*argc*/, const char ** /*argv*/) {
         const int n_past_0 = 100;
         const int n_past_2 = 33;
-        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        for (int i = 0; i < ne[2]; ++i) {
-            ((int32_t *) p0->data)[i] = n_past_0 + i;
-            ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
-            ((int32_t *) p2->data)[i] = n_past_2 + i;
-        }
-        // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
-        const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+        struct ggml_tensor * r0;
+        struct ggml_tensor * r1;
+        struct ggml_tensor * r2;
         x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-        // 100, 101, 102, ..., 172
-        struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
-        // -67, -67, -67, ..., -67
-        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
-        // 33, 34, 35, ..., 105
-        struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+        int mode = -1;
+        if (m < 3) {
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            for (int i = 0; i < ne[2]; ++i) {
+                ((int32_t *) p0->data)[i] = n_past_0 + i;
+                ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
+                ((int32_t *) p2->data)[i] = n_past_2 + i;
+            }
+            // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
+            mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+            // 100, 101, 102, ..., 172
+            r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
+            // -67, -67, -67, ..., -67
+            r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+            // 33, 34, 35, ..., 105
+            r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+        } else {
+            // testing multi-dimension rope position embedding mode
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            int sections[4] = {16, 24, 24, 0};
+            mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
+            for (int i = 0; i < ne[2]; ++i) {
+                for (int j = 0; j < 4; ++j) {
+                    ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
+                    ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
+                    ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
+                }
+            }
+            // [[100, 101, 102, ..., 172],
+            // [101, 102, 103, ..., 173],
+            // [102, 103, 104, ..., 174]]
+            r0 = ggml_mrope_ext(
+                ctx0, x, p0, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+            // [[-67, -67, -67, ..., -67]
+            // [-67, -67, -67, ..., -67]
+            // [-67, -67, -67, ..., -67]]
+            r1 = ggml_mrope_ext(
+                ctx0, r0, p1, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+            // [[33, 34, 35, ..., 105]
+            // [34, 35, 36, ..., 106]
+            // [35, 36, 37, ..., 107]]
+            r2 = ggml_mrope_ext(
+                ctx0, x, p2, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+        }
         ggml_cgraph * gf = ggml_new_graph(ctx0);
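
Note: every branch of the test relies on the same invariant as the original rope checks: RoPE rotations compose additively in the position argument, so rotating r0 by the constant delta (n_past_2 - n_past_0) must reproduce r2, which is rotated from x directly; the comparisons after this hunk check r1 against r2. The invariant in miniature, on a single rotation pair:

    #include <cassert>
    #include <complex>

    int main() {
        const double theta = 0.01;                // one illustrative RoPE frequency
        const int n_past_0 = 100, n_past_2 = 33;
        const std::complex<double> x(0.3, -0.7);  // one (even, odd) channel pair

        auto rot = [&](std::complex<double> v, int p) {
            return v * std::polar(1.0, theta * p);  // rotate by angle theta * position
        };

        const auto r0 = rot(x, n_past_0);
        const auto r1 = rot(r0, n_past_2 - n_past_0);  // the "context swap" on top of r0
        const auto r2 = rot(x, n_past_2);              // direct rotation of x
        assert(std::abs(r1 - r2) < 1e-12);             // additive composition holds
        return 0;
    }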