Add vl-rope/2d-rope support for qwen2vl ViT

HimariO 2024-09-30 22:30:02 +08:00
parent 35411963d2
commit 9d389a051b
4 changed files with 109 additions and 17 deletions
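For orientation: M-RoPE splits the rotary channel pairs into three sections that rotate with the temporal (t), height (h) and width (w) position components. This commit turns the section sizes into an explicit parameter so the qwen2vl vision encoder can use a 2D layout ({32, 32, 0}) instead of the LLM's {16, 24, 24}. A minimal sketch of the section-to-axis mapping, with an illustrative helper that is not part of ggml:

#include <array>

// Illustrative only: which position axis drives the rotation of a given channel
// pair for a 3-entry section table. Mirrors the sector logic that
// ggml_mrope_cache_init uses further down in this diff.
enum class PosAxis { T, H, W };

static PosAxis mrope_axis_for_pair(int pair_idx, const std::array<int, 3> & sections) {
    const int sect_dims = sections[0] + sections[1] + sections[2];
    const int sector    = pair_idx % sect_dims;
    if (sector < sections[0])               return PosAxis::T;  // temporal pairs
    if (sector < sections[0] + sections[1]) return PosAxis::H;  // height pairs
    return PosAxis::W;                                          // width pairs
}

With {16, 24, 24} this assigns pairs 0-15 to t, 16-39 to h and 40-63 to w; with {32, 32, 0} the first 32 pairs follow the first position component and the next 32 the second.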

View file

@@ -463,9 +463,10 @@ static void tmp_test_mrope(struct llava_context * ctx_llava, gpt_params * params
for (int i = 60; i < 90; i ++) pos_id[i] = i - 0;
memcpy(pos->data, pos_id.data(), 90 * ggml_element_size(pos));
int sections[3] = {16, 24, 24};
auto encode = ggml_mrope_ext(
ctx0, inp_raw, pos, nullptr,
128, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
128, sections, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
0, 1, 32, 1);
ggml_build_forward_expand(gf, encode);
@@ -490,6 +491,70 @@ static void tmp_test_mrope(struct llava_context * ctx_llava, gpt_params * params
}
}
static void tmp_test_mrope_2d(struct llava_context * ctx_llava, gpt_params * params) {
int n_threads = 1;
static size_t buf_size = 512u*1024*1024;
static void * buf = malloc(buf_size);
struct ggml_init_params init_params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(init_params);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 128, 12, 30);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
std::vector<float> dummy_q;
dummy_q.resize(128 * 12 * 30);
std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
struct ggml_tensor * pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 30 * 3);
ggml_set_name(pos, "pos");
ggml_set_input(pos);
std::vector<int> pos_id;
pos_id.resize(90);
for (int i = 0; i < 30; i ++) pos_id[i] = i;
for (int i = 30; i < 60; i ++) pos_id[i] = i - 30;
for (int i = 60; i < 90; i ++) pos_id[i] = i - 0;
memcpy(pos->data, pos_id.data(), 90 * ggml_element_size(pos));
int sections[3] = {32, 32, 0};
auto encode = ggml_mrope_ext(
ctx0, inp_raw, pos, nullptr,
128/2, sections, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
0, 1, 32, 1);
ggml_build_forward_expand(gf, encode);
ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
std::vector<float> embd;
embd.resize(128 * 12 * 30);
memcpy(
embd.data(),
(float *) ggml_get_data(encode),
sizeof(float) * 128 * 12 * 30);
ggml_free(ctx0);
std::ofstream outFile("mrope_2d.bin", std::ios::binary);
if (outFile.is_open()) {
outFile.write(reinterpret_cast<const char*>(embd.data()), embd.size() * sizeof(float));
outFile.close();
std::cout << "Data successfully written to mrope_2d.bin" << std::endl;
} else {
std::cerr << "Error opening file!" << std::endl;
}
}
/*
-----------------------------------------------------------------------------------------------------------------
*/
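tmp_test_mrope_2d dumps the rotated tensor so it can be diffed against a reference implementation offline. A small stand-alone reader for that dump, assuming the same 128 x 12 x 30 float layout used above (a sketch, not part of the commit):

#include <cstdio>
#include <fstream>
#include <vector>

int main() {
    // mrope_2d.bin is written by tmp_test_mrope_2d as raw floats
    std::ifstream in("mrope_2d.bin", std::ios::binary);
    if (!in) { std::fprintf(stderr, "cannot open mrope_2d.bin\n"); return 1; }

    std::vector<float> embd(128 * 12 * 30);
    in.read(reinterpret_cast<char *>(embd.data()), embd.size() * sizeof(float));

    std::printf("read %zu floats, first value %f\n", embd.size(), (double) embd[0]);
    return 0;
}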
@@ -542,7 +607,8 @@ int main(int argc, char ** argv) {
// process the prompt
// tmp_test_conv2d_reshape(ctx_llava, &params);
// tmp_test_rope(ctx_llava, &params);
tmp_test_mrope(ctx_llava, &params);
// tmp_test_mrope(ctx_llava, &params);
tmp_test_mrope_2d(ctx_llava, &params);
// process_prompt(ctx_llava, nullptr, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);

View file

@@ -1451,6 +1451,7 @@ extern "C" {
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[3],
int mode,
int n_ctx_orig,
float freq_base,

View file

@@ -3559,6 +3559,7 @@ struct ggml_tensor * ggml_mrope_ext(
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[3],
int mode,
int n_ctx_orig,
float freq_base,
@@ -3568,8 +3569,6 @@ struct ggml_tensor * ggml_mrope_ext(
float beta_fast,
float beta_slow) {
int sections[3] = {16, 24, 24}; // TODO: move this into gguf model file.
GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
GGML_ASSERT(ggml_is_vector(b));
@@ -3596,7 +3595,8 @@ struct ggml_tensor * ggml_mrope_ext(
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
memcpy(params + 11, &sections, sizeof(int) * 3);
memcpy(&params[11], sections, sizeof(int)*3);
// memcpy(params + 11, sections, sizeof(int)*3);
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE;
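The section table is packed into the op's parameter block at 32-bit slot 11, right after the eleven scalars the regular rope path already stores in slots 0-10. The matching read on the compute side is not shown in this diff; it would presumably look something like the helper below (an assumption, not the committed code):

#include <cstdint>
#include <cstring>

// Presumed read-back of the section table from a GGML_OP_ROPE node's op_params,
// mirroring the memcpy into params + 11 above.
static void rope_get_sections(const int32_t * op_params, int sections[3]) {
    std::memcpy(sections, op_params + 11, sizeof(int) * 3);
}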
@@ -11238,7 +11238,7 @@ static void ggml_rope_cache_init(
}
static void ggml_mrope_cache_init(
float theta_base_t, float theta_base_h, float theta_base_w, int sections[3],
float theta_base_t, float theta_base_h, float theta_base_w, int sections[3], bool indep_sects,
float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
float * cache, float sin_sign, float theta_scale) {
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -11246,12 +11246,25 @@ static void ggml_mrope_cache_init(
float theta_h = theta_base_h;
float theta_w = theta_base_w;
int sect_dims = sections[0] + sections[1] + sections[2];
int prev_sector = -1;
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
float theta = theta_t;
int sector = (i0 / 2) % sect_dims;
int sector = (i0 / 2) % sect_dims;
if (indep_sects) {
if (sector == 0) {
theta_t = theta_base_t;
}
else if (sector == sections[0]) {
theta_h = theta_base_h;
}
else if (sector == sections[1]) {
theta_w = theta_base_w;
}
}
float theta = theta_t;
if (sector < sections[1] + sections[0] && sector >= sections[0]) {
theta = theta_h;
}
@@ -11267,6 +11280,7 @@ static void ggml_mrope_cache_init(
theta_t *= theta_scale;
theta_w *= theta_scale;
theta_h *= theta_scale;
prev_sector = sector;
}
}
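The new indep_sects path (enabled below when sections[2] == 0, i.e. the ViT's 2D layout) restarts the running angle at each section boundary, so the h channels start from theta_base_h rather than from an angle that has already been decayed sections[0] times. A toy walk over the loop above with made-up values (not the ggml code):

#include <cstdio>

int main() {
    const int   sections[3]  = {2, 2, 0};  // tiny 2D layout: 2 t-pairs then 2 h-pairs
    const int   sect_dims    = sections[0] + sections[1] + sections[2];
    const float theta_scale  = 0.5f;       // illustrative per-pair decay
    const bool  indep_sects  = sections[2] == 0;
    const float theta_base_t = 7.0f;       // pretend temporal position 7
    const float theta_base_h = 3.0f;       // pretend height position 3

    float theta_t = theta_base_t;
    float theta_h = theta_base_h;
    for (int sector = 0; sector < sect_dims; ++sector) {
        if (indep_sects && sector == sections[0]) {
            theta_h = theta_base_h;        // restart the h ladder at its base angle
        }
        const float theta = sector < sections[0] ? theta_t : theta_h;
        std::printf("sector %d -> theta %.3f\n", sector, theta);
        theta_t *= theta_scale;            // both ladders decay every pair,
        theta_h *= theta_scale;            // which is why the reset matters
    }
    // prints 7.000, 3.500, 3.000, 1.500; without the reset the last two
    // values would be 0.750 and 0.375
    return 0;
}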
@@ -11366,7 +11380,7 @@ static void ggml_compute_forward_rope_f32(
const int64_t p_h = pos[i2 + ne2];
const int64_t p_w = pos[i2 + ne2 * 2];
ggml_mrope_cache_init(
p_t, p_h, p_w, sections,
p_t, p_h, p_w, sections, sections[2] == 0,
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
@@ -11406,12 +11420,23 @@ static void ggml_compute_forward_rope_f32(
}
}
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
if (is_mrope) {
// fill the remaining channels by repeating channels 0..n_dims-1
for (int64_t i0 = n_dims; i0 < ne0; i0 ++) {
float * dst_data_0 = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = dst_data_0[i0 % n_dims];
}
}
else {
// fill the remaining channels with data from the src tensor
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
}
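For the mrope case the channels past n_dims are no longer copied from the source tensor; they repeat the already-rotated channels 0..n_dims-1. A stand-alone illustration of that indexing on a single row, with toy sizes:

#include <cassert>
#include <vector>

int main() {
    const int n_dims = 4, ne0 = 8;                      // toy sizes
    std::vector<float> row = {0.f, 1.f, 2.f, 3.f,       // "rotated" channels
                              -1.f, -1.f, -1.f, -1.f};  // tail to be filled
    for (int i0 = n_dims; i0 < ne0; ++i0) {
        row[i0] = row[i0 % n_dims];                     // repeat channels 0..n_dims-1
    }
    assert(row[4] == 0.f && row[7] == 3.f);
    return 0;
}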

View file

@@ -12510,7 +12510,6 @@ struct llm_build_context {
struct ggml_cgraph * build_qwen2vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -12529,6 +12528,7 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
int sections[3] = {16, 24, 24}; // TODO: move this into gguf model file.
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -12560,14 +12560,14 @@ struct llm_build_context {
Qcur = ggml_mrope_ext(
ctx0,
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_mrope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
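build_qwen2vl now feeds inp_pos with three position values per token, which the kernel reads as pos[i2], pos[i2 + ne2] and pos[i2 + ne2*2]. How a caller fills that buffer is outside this commit; the sketch below shows one plausible layout, with text tokens sharing a single index across all three axes and image patches getting a fixed t plus their grid row and column (names and offsets are illustrative assumptions, not taken from the diff):

#include <cstdint>
#include <vector>

// Illustrative only: build a [t..., h..., w...] position buffer of 3 * n_tokens
// entries for n_text text tokens followed by a grid_h x grid_w image.
static std::vector<int32_t> build_mrope_positions(int n_text, int grid_h, int grid_w) {
    const int n_tokens = n_text + grid_h * grid_w;
    std::vector<int32_t> pos(3 * n_tokens);
    for (int i = 0; i < n_text; ++i) {
        pos[i] = pos[i + n_tokens] = pos[i + 2 * n_tokens] = i;   // text: t = h = w
    }
    for (int r = 0; r < grid_h; ++r) {
        for (int c = 0; c < grid_w; ++c) {
            const int i = n_text + r * grid_w + c;
            pos[i]                = n_text;       // t: one step for the whole image
            pos[i + n_tokens]     = n_text + r;   // h: patch row
            pos[i + 2 * n_tokens] = n_text + c;   // w: patch column
        }
    }
    return pos;
}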