diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 2f302d935..c61a4d415 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -761,7 +761,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (ctx->has_qwen2vl_merger) { Q = ggml_mrope_ext( ctx0, Q, positions, nullptr, - d_head/2, mrope_sections, 2 /*LLAMA_ROPE_TYPE_NEOX8*/, 32768, 10000, 1, 0, 1, 32, 1); + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); } Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); @@ -774,7 +774,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (ctx->has_qwen2vl_merger) { K = ggml_mrope_ext( ctx0, K, positions, nullptr, - d_head/2, mrope_sections, 2 /*LLAMA_ROPE_TYPE_NEOX8*/, 32768, 10000, 1, 0, 1, 32, 1); + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); } K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); @@ -1301,8 +1301,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { idx = get_key_idx(ctx, KEY_USE_GELU); new_clip->use_gelu = gguf_get_val_bool(ctx, idx); - idx = get_key_idx(ctx, KEY_USE_SILU); - new_clip->use_silu = gguf_get_val_bool(ctx, idx); + try { + idx = get_key_idx(ctx, KEY_USE_SILU); + new_clip->use_silu = gguf_get_val_bool(ctx, idx); + } catch (std::runtime_error & /*e*/) { + new_clip->use_silu = false; + } if (verbosity >= 1) { LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 5d01181dc..56c6f2c05 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -237,7 +237,9 @@ #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 -#define GGML_ROPE_TYPE_NEOX 2 +#define GGML_ROPE_TYPE_NEOX 2 +#define GGML_ROPE_TYPE_MROPE 4 +#define GGML_ROPE_TYPE_VISION 12 #define GGUF_MAGIC "GGUF" diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 9a66badb9..b09ecac19 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -414,11 +414,15 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - memcpy(§ions.v, (int32_t *) dst->op_params + 11, sizeof(int)*4); + memcpy(§ions.v, (int32_t *) dst->op_params + 11, sizeof(int)*4); - const bool is_mrope = sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0; - const bool is_vision = is_mrope && sections.v[3] > 0; - const bool is_neox = (mode & GGML_ROPE_TYPE_NEOX) & !(is_mrope || is_vision); // TODO: fix this with new rope type + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + if (is_mrope) { + GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0); + } if (is_vision) { GGML_ASSERT(n_dims == ne00/2); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7b16637cd..0cb53e13b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -11359,8 +11359,12 @@ static void ggml_compute_forward_rope_f32( ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - const bool is_mrope = sections[0] > 0 || sections[1] > 0 || sections[2] > 0; - const bool is_vision = is_mrope && sections[3] > 0; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + if (is_mrope) { + GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0); + } if (is_vision) { GGML_ASSERT(n_dims == ne0/2); @@ -11539,8 +11543,12 @@ static void ggml_compute_forward_rope_f16( ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - const bool is_mrope = sections[0] > 0 || sections[1] > 0 || sections[2] > 0; - const bool is_vision = is_mrope && sections[3] > 0; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + if (is_mrope) { + GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0); + } if (is_vision) { GGML_ASSERT(n_dims == ne0/2); @@ -11562,7 +11570,6 @@ static void ggml_compute_forward_rope_f16( for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = 0; i2 < ne2; i2++) { - const int64_t p = pos[i2]; float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; if (!is_mrope) { diff --git a/include/llama.h b/include/llama.h index ab5e376e6..5f181d4c7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -107,9 +107,11 @@ extern "C" { }; enum llama_rope_type { - LLAMA_ROPE_TYPE_NONE = -1, - LLAMA_ROPE_TYPE_NORM = 0, - LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX, + LLAMA_ROPE_TYPE_NONE = -1, + LLAMA_ROPE_TYPE_NORM = 0, + LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX, + LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE, + LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION, }; enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file diff --git a/src/llama.cpp b/src/llama.cpp index fdc93dcd6..7c4172a42 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20237,7 +20237,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_BITNET: case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: - case LLM_ARCH_QWEN2VL: case LLM_ARCH_QWEN2MOE: case LLM_ARCH_OLMO2: case LLM_ARCH_OLMOE: @@ -20253,6 +20252,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_EXAONE: case LLM_ARCH_MINICPM3: return LLAMA_ROPE_TYPE_NEOX; + + case LLM_ARCH_QWEN2VL: + return LLAMA_ROPE_TYPE_MROPE; // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: