diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 6ac70ba69..04fe6bef0 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -167,11 +167,29 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ } printf("system_prompt: %s\n", system_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } printf("user_prompt: %s\n", user_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } } else { // llava-1.5 native mode system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; user_prompt = prompt + "\nASSISTANT:"; + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } } eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos); @@ -183,13 +201,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ fprintf(stderr, "\n"); struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); - + std::string response = ""; for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); + response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) + if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 + if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 + fflush(stdout); } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 4ba89eb97..ff99a688e 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -116,7 +116,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches - *n_img_pos_out = result->ne[1]+clip_n_patches(ctx_clip); + *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); // Debug: Test single segments // Current findings: sending base image, sending a segment embedding all works similar to python @@ -179,12 +179,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside clip_image_f32_free(img_res_v[i]); if (!encoded) { - fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, img_res_v.size()); + fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size()); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); std::vector> grid_pinpoints;