diff --git a/src/llama.cpp b/src/llama.cpp
index c409b162e..312e6dafb 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5244,7 +5244,7 @@ static void llm_load_hparams(
         case LLM_ARCH_CHAMELEON:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
+                hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
                 ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
 
                 switch (hparams.n_layer) {
@@ -13718,11 +13718,11 @@ struct llm_build_context {
             struct ggml_tensor * inpSA = inpL;
 
             // norm
-            if (!hparams.swin_norm) {
+            if (!hparams.swin_norm) {
                 cur = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
-            }
+            }
             cb(cur, "attn_norm", il);
 
             // self-attention
@@ -13780,11 +13780,11 @@ struct llm_build_context {
                         model.layers[il].wo, nullptr,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 
-                if (hparams.swin_norm) {
+                if (hparams.swin_norm) {
                     cur = llm_build_norm(ctx0, cur, hparams,
                         model.layers[il].attn_norm, NULL,
                         LLM_NORM_RMS, cb, il);
-                }
+                }
             }
 
             if (il == n_layer - 1) {
@@ -13799,12 +13799,12 @@ struct llm_build_context {
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
-            if (!hparams.swin_norm) {
+            if (!hparams.swin_norm) {
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                     model.layers[il].ffn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);
-            }
+            }
 
             cur = llm_build_ffn(ctx0, cur,
                 model.layers[il].ffn_up, NULL, NULL,
@@ -13814,12 +13814,12 @@ struct llm_build_context {
                 LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             cb(cur, "ffn_out", il);
 
-            if (hparams.swin_norm) {
+            if (hparams.swin_norm) {
                 cur = llm_build_norm(ctx0, cur, hparams,
                     model.layers[il].ffn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);
-            }
+            }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
@@ -13842,13 +13842,15 @@ struct llm_build_context {
             cur = ggml_mul_mat(ctx0, model.output, cur);
             cb(cur, "result_output_with_img_logits", -1);
 
-            int img_token_end_idx = 8196;
-            int img_token_start_idx = 4;
-            int num_img_tokens = img_token_end_idx - img_token_start_idx;
-            struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
-            img_logits = ggml_add1(ctx0, img_logits, ggml_new_f32(ctx0, -FLT_MAX));
-            cb(img_logits, "img_logits", -1);
-            cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
+            // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
+            // Needs to be removed once image outputs are supported.
+            int img_token_end_idx = 8196;
+            int img_token_start_idx = 4;
+            int num_img_tokens = img_token_end_idx - img_token_start_idx;
+            struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
+            img_logits = ggml_add1(ctx0, img_logits, ggml_new_f32(ctx0, -FLT_MAX));
+            cb(img_logits, "img_logits", -1);
+            cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
             cb(cur, "result_output", -1);
 
             ggml_build_forward_expand(gf, cur);
@@ -15713,8 +15715,8 @@ struct llm_tokenizer_bpe {
                 break;
             case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
                 regex_exprs = {
-                    "<sentinel:[0-9]+>", // Sentinel tokens
-                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
+                    "<sentinel:[0-9]+>", // Sentinel tokens
+                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
                     "([\t\n]| | )", // directly from tokenizer.json
                     "\\p{N}", // Individual digits
                     "[\\p{P}\\$\\+<=>\\^~\\|`]+", // Punctuation