Add vl-rope/2d-rope support for qwen2vl ViT
parent 35411963d2
commit 9d389a051b
4 changed files with 109 additions and 17 deletions
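For orientation: m-RoPE splits each head's rotary pairs into {temporal, height, width} sections and rotates each section by a different position component. A minimal standalone sketch of that mapping, using the {16, 24, 24} split this commit hard-codes for the language model; the sketch itself is not part of the commit, it only mirrors the sector logic added to ggml_mrope_cache_init below:

```cpp
#include <cstdio>

int main() {
    const int sections[3] = {16, 24, 24}; // {t, h, w} rotary-pair counts per head
    const int sect_dims = sections[0] + sections[1] + sections[2];
    const int n_pairs = 64; // rotary pairs per head: head size 128 / 2

    for (int pair = 0; pair < n_pairs; ++pair) {
        const int sector = pair % sect_dims;
        const char * comp = sector < sections[0]               ? "t (temporal)"
                          : sector < sections[0] + sections[1] ? "h (height)"
                                                               : "w (width)";
        printf("rotary pair %2d -> rotated by position component %s\n", pair, comp);
    }
    return 0;
}
```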
@@ -463,9 +463,10 @@ static void tmp_test_mrope(struct llava_context * ctx_llava, gpt_params * params
     for (int i = 60; i < 90; i ++) pos_id[i] = i - 0;
     memcpy(pos->data, pos_id.data(), 90 * ggml_element_size(pos));
 
+    int sections[3] = {16, 24, 24};
     auto encode = ggml_mrope_ext(
         ctx0, inp_raw, pos, nullptr,
-        128, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
+        128, sections, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
         0, 1, 32, 1);
 
     ggml_build_forward_expand(gf, encode);
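Both test functions feed positions in a component-major layout: one int per token per component, concatenated as [t..., h..., w...], which the ggml_compute_forward_rope_f32 hunk below reads back as pos[i2], pos[i2 + ne2], pos[i2 + ne2 * 2]. A hypothetical helper (build_mrope_pos is this note's own, not in the commit) showing that layout:

```cpp
#include <vector>

// Illustration only: build the component-major position buffer
// [t0..t(n-1), h0..h(n-1), w0..w(n-1)] that the tmp_test_* functions
// fill by hand. Here every component just counts tokens; a real ViT
// would use patch row/column indices for the h/w components.
static std::vector<int> build_mrope_pos(int n_tokens) {
    std::vector<int> pos(3 * n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
        pos[i]                = i; // temporal component
        pos[i + n_tokens]     = i; // height component
        pos[i + n_tokens * 2] = i; // width component
    }
    return pos;
}
```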
@@ -490,6 +491,70 @@ static void tmp_test_mrope(struct llava_context * ctx_llava, gpt_params * params
     }
 }
 
+
+static void tmp_test_mrope_2d(struct llava_context * ctx_llava, gpt_params * params) {
+    int n_threads = 1;
+    static size_t buf_size = 512u*1024*1024;
+    static void * buf = malloc(buf_size);
+
+    struct ggml_init_params init_params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(init_params);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 128, 12, 30);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    std::vector<float> dummy_q;
+    dummy_q.resize(128 * 12 * 30);
+    std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
+    memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
+
+    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 30 * 3);
+    ggml_set_name(pos, "pos");
+    ggml_set_input(pos);
+
+    std::vector<int> pos_id;
+    pos_id.resize(90);
+    for (int i = 0; i < 30; i ++) pos_id[i] = i;
+    for (int i = 30; i < 60; i ++) pos_id[i] = i - 30;
+    for (int i = 60; i < 90; i ++) pos_id[i] = i - 0;
+    memcpy(pos->data, pos_id.data(), 90 * ggml_element_size(pos));
+
+    int sections[3] = {32, 32, 0};
+    auto encode = ggml_mrope_ext(
+        ctx0, inp_raw, pos, nullptr,
+        128/2, sections, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
+        0, 1, 32, 1);
+
+    ggml_build_forward_expand(gf, encode);
+    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
+
+    std::vector<float> embd;
+    embd.resize(128 * 12 * 30);
+    memcpy(
+        embd.data(),
+        (float *) ggml_get_data(encode),
+        sizeof(float) * 128 * 12 * 30);
+    ggml_free(ctx0);
+
+    std::ofstream outFile("mrope_2d.bin", std::ios::binary);
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<const char*>(embd.data()), embd.size() * sizeof(float));
+
+        outFile.close();
+        std::cout << "Data successfully written to mrope_2d.bin" << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+}
+
 /*
 -----------------------------------------------------------------------------------------------------------------
 */
@@ -542,7 +607,8 @@ int main(int argc, char ** argv) {
     // process the prompt
     // tmp_test_conv2d_reshape(ctx_llava, &params);
     // tmp_test_rope(ctx_llava, &params);
-    tmp_test_mrope(ctx_llava, &params);
+    // tmp_test_mrope(ctx_llava, &params);
+    tmp_test_mrope_2d(ctx_llava, &params);
     // process_prompt(ctx_llava, nullptr, &params, params.prompt);
 
     llama_print_timings(ctx_llava->ctx_llama);
@@ -1451,6 +1451,7 @@ extern "C" {
             struct ggml_tensor * b,
             struct ggml_tensor * c,
             int n_dims,
+            int sections[3],
             int mode,
             int n_ctx_orig,
             float freq_base,
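Every ggml_mrope_ext caller now passes the section split between n_dims and mode. A hedged usage sketch of the updated signature; the wrapper name and tensor shapes are this note's assumptions, the hyperparameters mirror the tmp_test_mrope_2d call above, and GGML_ROPE_TYPE_NEOX is what LLAMA_ROPE_TYPE_NEOX aliases:

```cpp
#include "ggml.h"

// Illustrative wrapper (assumed name, not part of the commit): apply 2d-rope
// to a [n_embd_head = 128, n_head, n_tokens] tensor.
static struct ggml_tensor * apply_vision_rope_2d(
        struct ggml_context * ctx0,
        struct ggml_tensor  * cur,    // activations to rotate
        struct ggml_tensor  * pos) {  // [3 * n_tokens] component-major positions
    int sections[3] = {32, 32, 0};    // two spatial sections, no third
    return ggml_mrope_ext(
        ctx0, cur, pos, nullptr,
        128/2, sections, GGML_ROPE_TYPE_NEOX, 32768, // n_dims, sections, mode, n_ctx_orig
        1000000.0f, 1.0f,                            // freq_base, freq_scale
        0.0f, 1.0f, 32.0f, 1.0f);                    // ext_factor, attn_factor, beta_fast, beta_slow
}
```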
@@ -3559,6 +3559,7 @@ struct ggml_tensor * ggml_mrope_ext(
         struct ggml_tensor * b,
         struct ggml_tensor * c,
         int n_dims,
+        int sections[3],
         int mode,
         int n_ctx_orig,
         float freq_base,
@@ -3568,8 +3569,6 @@ struct ggml_tensor * ggml_mrope_ext(
         float beta_fast,
         float beta_slow) {
-
-    int sections[3] = {16, 24, 24}; // TODO: move this into gguf model file.
 
     GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
 
     GGML_ASSERT(ggml_is_vector(b));
@@ -3596,7 +3595,8 @@ struct ggml_tensor * ggml_mrope_ext(
     memcpy(params + 8, &attn_factor, sizeof(float));
     memcpy(params + 9, &beta_fast, sizeof(float));
     memcpy(params + 10, &beta_slow, sizeof(float));
-    memcpy(params + 11, &sections, sizeof(int) * 3);
+    memcpy(&params[11], sections, sizeof(int)*3);
+    // memcpy(params + 11, sections, sizeof(int)*3);
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
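The packing idiom above bit-copies floats into int32 op-param slots and appends the section array after the scalars (beta_slow at slot 10, sections at 11..13). A standalone demonstration of the round trip; the array size of 14 is this sketch's assumption (the real size is declared elsewhere in ggml.c), and it assumes the usual sizeof(float) == sizeof(int32_t):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    int32_t params[14] = {0};
    const float beta_slow = 1.0f;
    const int sections[3] = {32, 32, 0};

    memcpy(params + 10, &beta_slow, sizeof(float)); // float bit-copied into an int32 slot
    memcpy(&params[11], sections, sizeof(int) * 3); // sections appended after the scalars

    float readback;
    memcpy(&readback, params + 10, sizeof(float));  // the op reader recovers it the same way
    printf("beta_slow=%.1f sections={%d,%d,%d}\n",
           readback, (int) params[11], (int) params[12], (int) params[13]);
    return 0;
}
```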
@@ -11238,7 +11238,7 @@ static void ggml_rope_cache_init(
 }
 
 static void ggml_mrope_cache_init(
-     float theta_base_t, float theta_base_h, float theta_base_w, int sections[3],
+     float theta_base_t, float theta_base_h, float theta_base_w, int sections[3], bool indep_sects,
      float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
      float * cache, float sin_sign, float theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -11246,12 +11246,25 @@ static void ggml_mrope_cache_init(
     float theta_h = theta_base_h;
     float theta_w = theta_base_w;
     int sect_dims = sections[0] + sections[1] + sections[2];
+    int prev_sector = -1;
 
     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
         const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
-        float theta = theta_t;
-        int sector = (i0 / 2) % sect_dims;
+
+        int sector = (i0 / 2) % sect_dims;
+        if (indep_sects) {
+            if (sector == 0) {
+                theta_t = theta_base_t;
+            }
+            else if (sector == sections[0]) {
+                theta_h = theta_base_h;
+            }
+            else if (sector == sections[1]) {
+                theta_w = theta_base_w;
+            }
+        }
+
+        float theta = theta_t;
         if (sector < sections[1] + sections[0] && sector >= sections[0]) {
             theta = theta_h;
         }
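With indep_sects (which the forward pass below derives as sections[2] == 0, i.e. the 2d vision case), each section restarts from its own base theta at its boundary instead of continuing the shared geometric theta_scale progression across section boundaries. A toy standalone sketch of that difference, with made-up base values and a {2, 2, 0} split; it only mirrors the sector logic above, it is not the commit's code:

```cpp
#include <cstdio>

int main() {
    const int sections[3] = {2, 2, 0};            // toy {t, h, w} split
    const int sect_dims = sections[0] + sections[1] + sections[2];
    const bool indep_sects = sections[2] == 0;    // how the forward pass derives the flag
    const float theta_scale = 0.5f;

    const float theta_base_t = 3.0f, theta_base_h = 7.0f; // made-up positions
    float theta_t = theta_base_t, theta_h = theta_base_h;

    for (int pair = 0; pair < 8; ++pair) {
        const int sector = pair % sect_dims;
        if (indep_sects) {                        // restart each section at its base theta
            if (sector == 0)                theta_t = theta_base_t;
            else if (sector == sections[0]) theta_h = theta_base_h;
        }
        const float theta = sector < sections[0] ? theta_t : theta_h;
        printf("pair %d: sector %d, theta %.4f\n", pair, sector, theta);
        theta_t *= theta_scale;                   // shared geometric progression
        theta_h *= theta_scale;
    }
    return 0;
}
```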
@@ -11267,6 +11280,7 @@ static void ggml_mrope_cache_init(
         theta_t *= theta_scale;
         theta_w *= theta_scale;
         theta_h *= theta_scale;
+        prev_sector = sector;
     }
 }
 
@@ -11366,7 +11380,7 @@ static void ggml_compute_forward_rope_f32(
             const int64_t p_h = pos[i2 + ne2];
             const int64_t p_w = pos[i2 + ne2 * 2];
             ggml_mrope_cache_init(
-                p_t, p_h, p_w, sections,
+                p_t, p_h, p_w, sections, sections[2] == 0,
                 freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
         }
 
@@ -11406,12 +11420,23 @@ static void ggml_compute_forward_rope_f32(
                 }
             }
 
-            for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+            if (is_mrope) {
+                // fill the remaining channels by repeating channels 0..n_dims
+                for (int64_t i0 = n_dims; i0 < ne0; i0 ++) {
+                    float * dst_data_0 = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+                    float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                    dst_data[0] = dst_data_0[i0 % n_dims];
+                }
+            }
+            else {
+                // fill the remaining channels with data from the src tensor
+                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                    float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
-                dst_data[0] = src[0];
-                dst_data[1] = src[1];
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
+                }
             }
         }
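For heads wider than n_dims rotary channels, the mrope path now repeats the already-rotated channels 0..n_dims into the tail (dst_data[i0 % n_dims]) instead of copying the un-rotated source through. A toy illustration of the two tail-filling strategies, with made-up values:

```cpp
#include <cstdio>

int main() {
    const int ne0 = 8, n_dims = 4;
    float dst[8] = {10, 11, 12, 13, 0, 0, 0, 0};   // channels 0..n_dims already rotated
    const float src[8] = {0, 1, 2, 3, 4, 5, 6, 7}; // un-rotated source row

    const bool is_mrope = true;
    if (is_mrope) {
        // repeat the rotated channels, as the new branch does
        for (int i0 = n_dims; i0 < ne0; ++i0) dst[i0] = dst[i0 % n_dims];
    } else {
        // default rope: pass the source values through untouched
        for (int i0 = n_dims; i0 < ne0; ++i0) dst[i0] = src[i0];
    }

    for (int i0 = 0; i0 < ne0; ++i0) printf("%g ", dst[i0]);
    printf("\n"); // prints: 10 11 12 13 10 11 12 13
    return 0;
}
```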
@@ -12510,7 +12510,6 @@ struct llm_build_context {
 
     struct ggml_cgraph * build_qwen2vl() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -12529,6 +12528,7 @@ struct llm_build_context {
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        int sections[3] = {16, 24, 24}; // TODO: move this into gguf model file.
 
         for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * inpSA = inpL;
@@ -12560,14 +12560,14 @@ struct llm_build_context {
                 Qcur = ggml_mrope_ext(
                     ctx0,
                     ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_mrope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
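For reference, the two section layouts this change ends up with, as a compact summary (the names sections_lm and sections_vit are this note's own, not identifiers from the commit):

```cpp
// Hard-coded section splits wired up by this commit (see the TODOs above
// about moving them into the gguf model file):
int sections_lm [3] = {16, 24, 24}; // build_qwen2vl LM graph: t/h/w rotary pairs
int sections_vit[3] = {32, 32, 0};  // 2d-rope ViT test: two spatial sections, no third
```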