llama : add Qwen2VL support + multimodal RoPE (#10361)

* Barebones Qwen2VL LLM converter

* Add Qwen2VL cli entrypoint

* [WIP] add qwen2vl arch

* Verify m-rope output

* Add vl-rope/2d-rope support for qwen2vl ViT

* update qwen2vl cli tool

* update 5D tensor op workaround

* [WIP] qwen2vl vision model

* make batch and clip utils compatible with qwen2vl

* [WIP] create inference workflow and gguf convert script, plus fixes

* correct vision-rope behavior, add the missing last layer back to ViT

* add arg parser to qwen2vl_surgery

* replace variable size array with vector

* cuda-gdb cmake preset

* add fp32 mrope, vision rope kernel

* add fp16 support for qwen2vl and m-rope

* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`

* fix rope op mode switching, outdated func args

* update `llama_hparams`

* update to keep up with upstream changes

* resolve linter, test errors

* add makefile entry, update special image padding token

* add mrope unit test, fix a few compiler warnings

* rename `mrope`-related functions, params

* minor updates on debug util, bug fixes

* add `m-rope` testcase to `test-backend-ops`

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix trailing whitespace

* store `llama_hparams.rope_sections` with fixed size array

* update position id tensor size check in GGML_OP_ROPE

* minor updates

* update `ggml_backend_*_supports_op` of unsupported backends

* remove old `rope_section` compare operator

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
commit ba1cb19cdd
parent 56eea0781c
HimariO, 2024-12-14 20:43:46 +08:00, committed by GitHub
24 changed files with 1873 additions and 114 deletions
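In short, the new `GGML_ROPE_TYPE_MROPE` and `GGML_ROPE_TYPE_VISION` modes are exercised through `ggml_rope_multi`, which consumes four position ids per token (one per rope section) instead of one. A minimal sketch of that calling convention, assuming a ggml context `ctx0`, an input tensor `x`, and `n_tokens`/`n_rot` defined as in the test below; the constants mirror the test and are not the only valid choices:

    // M-RoPE positions: 4 ids per token, section-major, so
    // pos[i + n_tokens * j] holds the position of token i in section j
    // (sections cover time/height/width/extra; 16 + 24 + 24 + 0 = n_rot/2 rotary pairs).
    int sections[4] = {16, 24, 24, 0};
    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
    for (int i = 0; i < n_tokens; ++i) {
        for (int j = 0; j < 4; ++j) {
            ((int32_t *) pos->data)[i + n_tokens * j] = i; // plain text: same id in every section
        }
    }
    struct ggml_tensor * r = ggml_rope_multi(
        ctx0, x, pos, nullptr, // nullptr = no frequency-factors tensor
        n_rot, sections, GGML_ROPE_TYPE_MROPE,
        32768,   // n_ctx_orig
        1000000, // freq_base
        1, 0, 1, 32, 1); // freq_scale, ext_factor, attn_factor, beta_fast, beta_slow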

tests/test-rope.cpp

@@ -138,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
     struct ggml_tensor * x;
     // rope f32
-    for (int m = 0; m < 3; ++m) {
+    for (int m = 0; m < 5; ++m) {
         const int ndims = 4;
         const int64_t n_rot = 128;
@@ -147,28 +147,69 @@ int main(int /*argc*/, const char ** /*argv*/) {
         const int n_past_0 = 100;
         const int n_past_2 = 33;
-        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        for (int i = 0; i < ne[2]; ++i) {
-            ((int32_t *) p0->data)[i] = n_past_0 + i;
-            ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
-            ((int32_t *) p2->data)[i] = n_past_2 + i;
-        }
-        // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
-        const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+        struct ggml_tensor * r0;
+        struct ggml_tensor * r1;
+        struct ggml_tensor * r2;
         x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-        // 100, 101, 102, ..., 172
-        struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
-        // -67, -67, -67, ..., -67
-        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
-        // 33, 34, 35, ..., 105
-        struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+        int mode = -1;
+        if (m < 3) {
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            for (int i = 0; i < ne[2]; ++i) {
+                ((int32_t *) p0->data)[i] = n_past_0 + i;
+                ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
+                ((int32_t *) p2->data)[i] = n_past_2 + i;
+            }
+            // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
+            mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+            // 100, 101, 102, ..., 172
+            r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
+            // -67, -67, -67, ..., -67
+            r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+            // 33, 34, 35, ..., 105
+            r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+        } else {
+            // testing multi-dimension rope position embedding mode
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            int sections[4] = {16, 24, 24, 0};
+            mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
+            for (int i = 0; i < ne[2]; ++i) {
+                for (int j = 0; j < 4; ++j) {
+                    ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
+                    ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
+                    ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
+                }
+            }
+            // [[100, 101, 102, ..., 172],
+            //  [101, 102, 103, ..., 173],
+            //  [102, 103, 104, ..., 174]]
+            r0 = ggml_rope_multi(
+                ctx0, x, p0, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+            // [[-67, -67, -67, ..., -67]
+            //  [-67, -67, -67, ..., -67]
+            //  [-67, -67, -67, ..., -67]]
+            r1 = ggml_rope_multi(
+                ctx0, r0, p1, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+            // [[33, 34, 35, ..., 105]
+            //  [34, 35, 36, ..., 106]
+            //  [35, 36, 37, ..., 107]]
+            r2 = ggml_rope_multi(
+                ctx0, x, p2, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+        }
         ggml_cgraph * gf = ggml_new_graph(ctx0);
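The expected-value comments in the new multi-dimensional branch follow directly from the position layout: section j of each token is offset by j, so row j of r0 starts at 100 + j and row j of r2 at 33 + j, while r1 collapses to -67 in every section, matching the 1-D "context swap" case above.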