further cleanup; move llava-cli into its own file and rename
This commit is contained in:
parent
0889117573
commit
f83c0606bd
8 changed files with 128 additions and 110 deletions
|
@ -11,9 +11,21 @@ if(TARGET BUILD_INFO)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(TARGET llava)
|
set(TARGET llava)
|
||||||
add_executable(${TARGET} llava.cpp)
|
add_library(${TARGET} llava.cpp llava.h)
|
||||||
|
install(TARGETS ${TARGET} LIBRARY)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
if (NOT MSVC)
|
||||||
|
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
|
||||||
|
endif()
|
||||||
|
if(TARGET BUILD_INFO)
|
||||||
|
add_dependencies(${TARGET} BUILD_INFO)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set(TARGET llava-cli)
|
||||||
|
add_executable(${TARGET} llava-cli.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama clip llava ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
if(TARGET BUILD_INFO)
|
if(TARGET BUILD_INFO)
|
||||||
add_dependencies(${TARGET} BUILD_INFO)
|
add_dependencies(${TARGET} BUILD_INFO)
|
||||||
|
|
|
@ -9,12 +9,12 @@ models are available.
|
||||||
After API is confirmed, more models will be supported / uploaded.
|
After API is confirmed, more models will be supported / uploaded.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
Build with cmake or run `make llava` to build it.
|
Build with cmake or run `make llava-cli` to build it.
|
||||||
|
|
||||||
After building, run: `./llava` to see the usage. For example:
|
After building, run: `./llava-cli` to see the usage. For example:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
|
./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
|
||||||
```
|
```
|
||||||
|
|
||||||
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
|
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
|
||||||
|
|
|
@ -690,7 +690,7 @@ static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_
|
||||||
memcpy(img->data, data, img->size);
|
memcpy(img->data, data, img->size);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_load_from_bytes(const unsigned char * bytes, int bytes_length, clip_image_u8 * img) {
|
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, clip_image_u8 * img) {
|
||||||
int nx, ny, nc;
|
int nx, ny, nc;
|
||||||
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
|
|
|
@ -58,7 +58,7 @@ struct clip_image_f32_batch {
|
||||||
struct clip_image_u8 * make_clip_image_u8();
|
struct clip_image_u8 * make_clip_image_u8();
|
||||||
struct clip_image_f32 * make_clip_image_f32();
|
struct clip_image_f32 * make_clip_image_f32();
|
||||||
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||||
bool clip_image_load_from_bytes(const unsigned char * bytes, int bytes_length, clip_image_u8 * img);
|
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, clip_image_u8 * img);
|
||||||
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
|
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
|
||||||
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
|
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
|
||||||
|
|
||||||
|
|
101
examples/llava/llava-cli.cpp
Normal file
101
examples/llava/llava-cli.cpp
Normal file
|
@ -0,0 +1,101 @@
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "common.h"
|
||||||
|
#include "clip.h"
|
||||||
|
#include "llava.h"
|
||||||
|
#include "llava-utils.h"
|
||||||
|
|
||||||
|
|
||||||
|
// Prints an example invocation for the program named by argv[0], followed by a
// note recommending a low sampling temperature. argc is intentionally unused.
static void show_additional_info(int /*argc*/, char ** argv) {
    const char * program = argv[0];

    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", program);

    // no format arguments here, so the text is written out verbatim
    fputs(" note: a lower temperature value like 0.1 is recommended for better quality.\n", stdout);
}
|
||||||
|
|
||||||
|
// Loads the input image and builds its embedding with llava_build_img_embed().
//
// The image is taken from a base64 image tag embedded in params->prompt when
// one is present (in which case params->image is ignored and a notice is
// printed if it was also given); otherwise it is read from the file named by
// params->image.
//
// When the image comes from the prompt, the image tag is removed from
// params->prompt so that only the textual part is left for evaluation.
//
// image_embd and n_image_pos are forwarded to llava_build_img_embed() as its
// output arguments.
//
// Returns true on success; false if the image cannot be loaded or if
// llava_build_img_embed() reports failure.
static bool load_image(llava_context * ctx_llava, gpt_params * params, float **image_embd, int * n_image_pos) {
    // load and preprocess the image
    clip_image_u8 img;

    if (prompt_contains_image(params->prompt)) {
        if (!params->image.empty()) {
            printf("using base64 encoded image instead of command line image path\n");
        }
        if (!clip_image_load_from_prompt(params->prompt, &img)) {
            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
            return false;
        }
        // Write the stripped prompt back: previously it was stored in a local
        // copy and discarded, leaving the base64 payload in params->prompt.
        params->prompt = remove_image_from_prompt(params->prompt);
    } else if (!clip_image_load_from_file(params->image.c_str(), &img)) {
        fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
        return false;
    }

    // Propagate embedding failures instead of unconditionally reporting success.
    if (!llava_build_img_embed(ctx_llava, params->n_threads, &img, image_embd, n_image_pos)) {
        fprintf(stderr, "%s: failed to build image embedding\n", __func__);
        return false;
    }

    return true;
}
|
||||||
|
|
||||||
|
static void process_prompt(struct llava_context * ctx_llava, float * image_embd, int n_img_pos, gpt_params * params, const char * prompt) {
|
||||||
|
int n_past = 0;
|
||||||
|
|
||||||
|
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
||||||
|
|
||||||
|
// llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
|
||||||
|
// GG: are we sure that the should be a trailing whitespace at the end of this string?
|
||||||
|
eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params->n_batch, &n_past);
|
||||||
|
eval_image_embd(ctx_llava->ctx_llama, image_embd, n_img_pos, params->n_batch, &n_past);
|
||||||
|
eval_string(ctx_llava->ctx_llama, prompt, params->n_batch, &n_past);
|
||||||
|
eval_string(ctx_llava->ctx_llama, "\nASSISTANT:", params->n_batch, &n_past);
|
||||||
|
|
||||||
|
// generate the response
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
for (int i = 0; i < max_tgt_len; i++) {
|
||||||
|
const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
|
||||||
|
if (strcmp(tmp, "</s>") == 0) break;
|
||||||
|
|
||||||
|
printf("%s", tmp);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
ggml_time_init();
|
||||||
|
|
||||||
|
gpt_params params;
|
||||||
|
|
||||||
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
show_additional_info(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||||
|
gpt_print_usage(argc, argv, params);
|
||||||
|
show_additional_info(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto ctx_llava = llava_init(¶ms);
|
||||||
|
if (ctx_llava == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to init llava\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
float * image_embd;
|
||||||
|
int n_image_pos;
|
||||||
|
load_image(ctx_llava, ¶ms, &image_embd, &n_image_pos);
|
||||||
|
|
||||||
|
// process the prompt
|
||||||
|
process_prompt(ctx_llava, image_embd, n_image_pos, ¶ms, params.prompt.c_str());
|
||||||
|
|
||||||
|
llama_print_timings(ctx_llava->ctx_llama);
|
||||||
|
|
||||||
|
free(image_embd);
|
||||||
|
llava_free(ctx_llava);
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -149,19 +149,19 @@ inline const char * sample(struct llama_context * ctx_llama, gpt_params & params
|
||||||
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
|
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
|
||||||
static const char* IMG_BASE64_TAG_END = "\">";
|
static const char* IMG_BASE64_TAG_END = "\">";
|
||||||
|
|
||||||
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
|
inline void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
|
||||||
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
|
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
|
||||||
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
|
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool prompt_contains_image(const std::string& prompt) {
|
inline bool prompt_contains_image(const std::string& prompt) {
|
||||||
size_t begin, end;
|
size_t begin, end;
|
||||||
find_image_tag_in_prompt(prompt, begin, end);
|
find_image_tag_in_prompt(prompt, begin, end);
|
||||||
return (begin != std::string::npos);
|
return (begin != std::string::npos);
|
||||||
}
|
}
|
||||||
|
|
||||||
// replaces the base64 image tag in the prompt with `replacement`
|
// replaces the base64 image tag in the prompt with `replacement`
|
||||||
static bool get_image_from_prompt(const std::string& prompt, clip_image_u8 * img) {
|
inline bool clip_image_load_from_prompt(const std::string& prompt, clip_image_u8 * img) {
|
||||||
size_t img_base64_str_start, img_base64_str_end;
|
size_t img_base64_str_start, img_base64_str_end;
|
||||||
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
||||||
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
||||||
|
@ -176,7 +176,7 @@ static bool get_image_from_prompt(const std::string& prompt, clip_image_u8 * img
|
||||||
auto required_bytes = base64::required_encode_size(base64_str.size());
|
auto required_bytes = base64::required_encode_size(base64_str.size());
|
||||||
auto img_bytes = std::vector<unsigned char>(required_bytes);
|
auto img_bytes = std::vector<unsigned char>(required_bytes);
|
||||||
auto img_bytes_end = base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
|
auto img_bytes_end = base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
|
||||||
auto img_bytes_len = img_bytes_end - img_bytes.begin();
|
size_t img_bytes_len = img_bytes_end - img_bytes.begin();
|
||||||
|
|
||||||
auto img_loaded_ok = clip_image_load_from_bytes(img_bytes.data(), img_bytes_len, img);
|
auto img_loaded_ok = clip_image_load_from_bytes(img_bytes.data(), img_bytes_len, img);
|
||||||
if (!img_loaded_ok) {
|
if (!img_loaded_ok) {
|
||||||
|
@ -187,7 +187,7 @@ static bool get_image_from_prompt(const std::string& prompt, clip_image_u8 * img
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
|
inline std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
|
||||||
size_t begin, end;
|
size_t begin, end;
|
||||||
find_image_tag_in_prompt(prompt, begin, end);
|
find_image_tag_in_prompt(prompt, begin, end);
|
||||||
if (begin == std::string::npos || end == std::string::npos) {
|
if (begin == std::string::npos || end == std::string::npos) {
|
||||||
|
|
|
@ -10,11 +10,6 @@
|
||||||
|
|
||||||
#include "base64.hpp"
|
#include "base64.hpp"
|
||||||
|
|
||||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
|
||||||
printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
|
||||||
printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool encode_image_with_clip(llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_embd, int * n_img_pos) {
|
static bool encode_image_with_clip(llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_embd, int * n_img_pos) {
|
||||||
auto ctx_clip = ctx_llava->ctx_clip;
|
auto ctx_clip = ctx_llava->ctx_clip;
|
||||||
clip_image_f32 img_res;
|
clip_image_f32 img_res;
|
||||||
|
@ -51,7 +46,7 @@ static bool encode_image_with_clip(llava_context * ctx_llava, int n_threads, con
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool llava_build_img_embed(struct llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_image_pos_out) {
|
bool llava_build_img_embed(struct llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_image_pos_out) {
|
||||||
|
|
||||||
auto ctx_clip = ctx_llava->ctx_clip;
|
auto ctx_clip = ctx_llava->ctx_clip;
|
||||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
|
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
|
||||||
|
@ -128,91 +123,3 @@ void llava_free(struct llava_context * ctx_llava) {
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static bool load_image(llava_context * ctx_llava, gpt_params * params, float **image_embd, int * n_image_pos) {
|
|
||||||
// load and preprocess the image
|
|
||||||
clip_image_u8 img;
|
|
||||||
auto prompt = params->prompt;
|
|
||||||
if (prompt_contains_image(prompt)) {
|
|
||||||
if (!params->image.empty()) {
|
|
||||||
printf("using base64 encoded image instead of command line image path\n");
|
|
||||||
}
|
|
||||||
if (!get_image_from_prompt(prompt, &img)) {
|
|
||||||
fprintf(stderr, "%s: can't load image from prompt\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
prompt = remove_image_from_prompt(prompt);
|
|
||||||
} else {
|
|
||||||
if (!clip_image_load_from_file(params->image.c_str(), &img)) {
|
|
||||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
llava_build_img_embed(ctx_llava, params->n_threads, &img, image_embd, n_image_pos);
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void process_prompt(struct llava_context * ctx_llava, float * image_embd, int n_img_pos, gpt_params * params, const char * prompt) {
|
|
||||||
int n_past = 0;
|
|
||||||
|
|
||||||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
|
||||||
|
|
||||||
// llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
|
|
||||||
// GG: are we sure that the should be a trailing whitespace at the end of this string?
|
|
||||||
eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params->n_batch, &n_past);
|
|
||||||
eval_image_embd(ctx_llava->ctx_llama, image_embd, n_img_pos, params->n_batch, &n_past);
|
|
||||||
eval_string(ctx_llava->ctx_llama, prompt, params->n_batch, &n_past);
|
|
||||||
eval_string(ctx_llava->ctx_llama, "\nASSISTANT:", params->n_batch, &n_past);
|
|
||||||
|
|
||||||
// generate the response
|
|
||||||
|
|
||||||
printf("\n");
|
|
||||||
|
|
||||||
for (int i = 0; i < max_tgt_len; i++) {
|
|
||||||
const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
|
|
||||||
if (strcmp(tmp, "</s>") == 0) break;
|
|
||||||
|
|
||||||
printf("%s", tmp);
|
|
||||||
fflush(stdout);
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("\n");
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
|
||||||
ggml_time_init();
|
|
||||||
|
|
||||||
gpt_params params;
|
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
|
||||||
show_additional_info(argc, argv);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
|
||||||
gpt_print_usage(argc, argv, params);
|
|
||||||
show_additional_info(argc, argv);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto ctx_llava = llava_init(&params);
|
|
||||||
if (ctx_llava == NULL) {
|
|
||||||
fprintf(stderr, "%s: error: failed to init llava\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
float * image_embd;
|
|
||||||
int n_image_pos;
|
|
||||||
load_image(ctx_llava, &params, &image_embd, &n_image_pos);
|
|
||||||
|
|
||||||
// process the prompt
|
|
||||||
process_prompt(ctx_llava, image_embd, n_image_pos, &params, params.prompt.c_str());
|
|
||||||
|
|
||||||
llama_print_timings(ctx_llava->ctx_llama);
|
|
||||||
|
|
||||||
free(image_embd);
|
|
||||||
llava_free(ctx_llava);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
#define LLAVA_H
|
#define LLAVA_H
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
struct clip_ctx;
|
struct clip_ctx;
|
||||||
|
|
||||||
|
@ -13,15 +14,12 @@ struct llava_context {
|
||||||
struct clip_ctx * ctx_clip = NULL;
|
struct clip_ctx * ctx_clip = NULL;
|
||||||
struct llama_context * ctx_llama = NULL;
|
struct llama_context * ctx_llama = NULL;
|
||||||
struct llama_model * model = NULL;
|
struct llama_model * model = NULL;
|
||||||
|
|
||||||
// int n_img_pos = 0;
|
|
||||||
// float * image_embd = NULL;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llava_context * llava_init(gpt_params * params);
|
struct llava_context * llava_init(gpt_params * params);
|
||||||
void llava_free(struct llava_context * ctx_llava);
|
void llava_free(struct llava_context * ctx_llava);
|
||||||
|
|
||||||
//void llava_process_prompt(struct llava_context * ctx_llava, gpt_params * params, const char * prompt);
|
bool llava_build_img_embed(struct llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_image_pos_out);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue