llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM converter
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correct vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, outdated func args
* update `llama_hparams`
* update to keep up with upstream changes
* resolve linter, test errors
* add makefile entry, update special image padding token
* add mrope unit test, fix a few compiler warnings
* rename `mrope` related functions, params
* minor updates on debug util, bug fixes
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix trailing whitespace
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remove old `rope_section` compare operator

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent 56eea0781c
commit ba1cb19cdd

24 changed files with 1873 additions and 114 deletions
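For orientation, here is a minimal sketch of how the new multi-section RoPE entry point is driven. The `ggml_rope_multi` argument order and the trailing values mirror the test code in the diff below; the tensor shape, token count, and section split are illustrative assumptions, not Qwen2VL's actual configuration, and the sketch assumes a CPU ggml context with allocated tensor data (as in the standalone rope test further down).

    // Sketch only: mrope_sketch is a hypothetical helper, not part of this commit.
    static ggml_tensor * mrope_sketch(ggml_context * ctx, ggml_tensor * x) {
        const int n_dims   = 128;             // rotated dimensions (illustrative)
        const int n_tokens = (int) x->ne[2];  // positions are indexed along ne[2]
        int sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0}; // e.g. temporal / height / width / unused

        // M-RoPE takes 4 position ids per token, stored section-major:
        // pos[i + n_tokens*j] is the position of token i in section j.
        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens * 4);
        for (int j = 0; j < 4; ++j) {
            for (int i = 0; i < n_tokens; ++i) {
                ((int32_t *) pos->data)[i + n_tokens*j] = i; // all sections share the token index here
            }
        }

        return ggml_rope_multi(
            ctx, x, pos, nullptr /* freq factors */,
            n_dims, sections, GGML_ROPE_TYPE_MROPE,
            32768,     // n_ctx_orig
            1000000,   // freq_base
            1, 0, 1,   // freq_scale, ext_factor, attn_factor
            32, 1);    // beta_fast, beta_slow
    }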
tests/test-backend-ops.cpp

@@ -2201,7 +2201,15 @@ struct test_rope : public test_case {
             ggml_set_name(a, "a");
         }
 
-        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+        const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+        const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+        ggml_tensor * pos;
+        if (is_mrope || is_vision) {
+            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2] * 4);
+        } else {
+            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+        }
         ggml_set_name(pos, "pos");
 
         ggml_tensor * freq = nullptr;
@@ -2210,7 +2218,20 @@ struct test_rope : public test_case {
             ggml_set_name(freq, "freq");
         }
 
-        ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+        ggml_tensor * out;
+        if (is_mrope) {
+            if (is_vision) {
+                GGML_ASSERT(n_dims/4 > 0);
+                int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only use first two dimension for image (x, y) coordinate
+                out = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+            } else {
+                GGML_ASSERT(n_dims/3 > 0);
+                int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0};
+                out = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+            }
+        } else {
+            out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+        }
         ggml_set_name(out, "out");
 
         return out;
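A worked instance of the vision branch above, using the qwen2vl ViT test case added further down (ne_a = { 80, 16, 2, 1}, n_dims = 80); the numbers are just the arithmetic the code performs:

    // is_vision: rope_sections = {80/4, 80/4, 0, 0} = {20, 20, 0, 0}
    // and ggml_rope_multi is called with n_dims/2 = 40 as the rotated-dimension
    // count, i.e. only the two image-coordinate (x, y) sections are populated.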
@@ -2220,11 +2241,12 @@ struct test_rope : public test_case {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (t->type == GGML_TYPE_I32) {
                 // pos
-                std::vector<int> data(ne_a[2]);
-                for (int i = 0; i < ne_a[2]; i++) {
+                const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? ne_a[2] * 4 : ne_a[2];
+                std::vector<int> data(num_pos_ids);
+                for (int i = 0; i < num_pos_ids; i++) {
                     data[i] = rand() % n_ctx;
                 }
-                ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
+                ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int));
             } else {
                 if (t->ne[0] == n_dims/2) {
                     // frequency factors in the range [0.9f, 1.1f]
@@ -3813,6 +3835,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                                 test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
                             }
 
+                            if (all) {
+                                test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 2B)
+                                test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 7B)
+                                test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1},  80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl ViT)
+                            }
+
                             test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
                         }
                     }
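The hunks below, from the standalone RoPE test, exercise the same section-major position layout with concrete values. As a reading aid for the expected-value comments in that diff (this note is an illustration, not part of the commit): the fill loop writes section j of p0 as n_past_0 + i + j, and with sections[4] = {16, 24, 24, 0} only the first three streams are consumed, which is why the comments show three rows of expected positions.

    // p0, section j, token i:  n_past_0 + i + j
    //   j = 0 -> 100, 101, ..., 172
    //   j = 1 -> 101, 102, ..., 173
    //   j = 2 -> 102, 103, ..., 174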
tests/test-rope.cpp

@@ -138,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
     struct ggml_tensor * x;
 
     // rope f32
-    for (int m = 0; m < 3; ++m) {
+    for (int m = 0; m < 5; ++m) {
         const int ndims = 4;
 
         const int64_t n_rot = 128;
@@ -147,28 +147,69 @@ int main(int /*argc*/, const char ** /*argv*/) {
         const int n_past_0 = 100;
         const int n_past_2 = 33;
 
-        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-
-        for (int i = 0; i < ne[2]; ++i) {
-            ((int32_t *) p0->data)[i] = n_past_0 + i;
-            ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
-            ((int32_t *) p2->data)[i] = n_past_2 + i;
-        }
-
-        // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
-        const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
-
+        struct ggml_tensor * r0;
+        struct ggml_tensor * r1;
+        struct ggml_tensor * r2;
         x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+        int mode = -1;
 
-        // 100, 101, 102, ..., 172
-        struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
-        // -67, -67, -67, ..., -67
-        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+        if (m < 3) {
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
 
-        // 33, 34, 35, ..., 105
-        struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+            for (int i = 0; i < ne[2]; ++i) {
+                ((int32_t *) p0->data)[i] = n_past_0 + i;
+                ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
+                ((int32_t *) p2->data)[i] = n_past_2 + i;
+            }
+            // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
+            mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+
+            // 100, 101, 102, ..., 172
+            r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
+            // -67, -67, -67, ..., -67
+            r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+
+            // 33, 34, 35, ..., 105
+            r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+        } else {
+            // testing multi-dimension rope position embedding mode
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+
+            int sections[4] = {16, 24, 24, 0};
+            mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
+
+            for (int i = 0; i < ne[2]; ++i) {
+                for (int j = 0; j < 4; ++j) {
+                    ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
+                    ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
+                    ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
+                }
+            }
+
+            // [[100, 101, 102, ..., 172],
+            // [101, 102, 103, ..., 173],
+            // [102, 103, 104, ..., 174]]
+            r0 = ggml_rope_multi(
+                ctx0, x, p0, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+            // [[-67, -67, -67, ..., -67]
+            // [-67, -67, -67, ..., -67]
+            // [-67, -67, -67, ..., -67]]
+            r1 = ggml_rope_multi(
+                ctx0, r0, p1, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+
+            // [[33, 34, 35, ..., 105]
+            // [34, 35, 36, ..., 106]
+            // [35, 36, 37, ..., 107]]
+            r2 = ggml_rope_multi(
+                ctx0, x, p2, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+        }
 
         ggml_cgraph * gf = ggml_new_graph(ctx0);