llama : add support for GLM-Edge and GLM-Edge-V series models (#10573)
* add glm edge chat model * use config partial_rotary_factor as rope ratio * support for glm edge model * vision model support * remove debug info * fix format * llava.cpp trailing whitespace * remove unused AutoTokenizer * Update src/llama.cpp for not contain <|end|> or </s> Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> * add edge template * fix chat template * fix confict * fix confict * fix ci err * fix format err * fix template err * 9b hf chat support * format * format clip.cpp * fix format * Apply suggestions from code review * Apply suggestions from code review * Update examples/llava/clip.cpp * fix format * minor : style --------- Co-authored-by: liyuhang <yuhang.li@zhipuai.cn> Co-authored-by: piDack <pcdack@hotmail.co> Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> Co-authored-by: liyuhang <yuhang.li@aminer.cn> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
53debe6f3c
commit
0cec062a63
15 changed files with 568 additions and 67 deletions
|
@ -311,6 +311,20 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||
img_res_v.size = 0;
|
||||
img_res_v.data = nullptr;
|
||||
}
|
||||
else if (clip_is_glm(ctx_clip)){
|
||||
struct clip_image_size * load_image_size = clip_image_size_init();
|
||||
load_image_size->width = img_res_v.data[0].nx;
|
||||
load_image_size->height = img_res_v.data[0].ny;
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
|
||||
int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
|
||||
*n_img_pos = (pos * pos + 2);
|
||||
if (!encoded){
|
||||
LOG_ERR("Unable to encode image \n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||
// flat / default llava-1.5 type embedding
|
||||
*n_img_pos = clip_n_patches(ctx_clip);
|
||||
|
@ -395,6 +409,9 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
|
|||
if (clip_is_minicpmv(ctx_clip)) {
|
||||
num_max_patches = 10;
|
||||
}
|
||||
if (clip_is_glm(ctx_clip)) {
|
||||
num_max_patches = 1;
|
||||
}
|
||||
float * image_embd;
|
||||
if (clip_is_qwen2vl(ctx_clip)) {
|
||||
// qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue