Xuan Son Nguyen 2024-10-03 20:15:36 +02:00
parent 49e7304cdf
commit a88c0d5f26
8 changed files with 230 additions and 105 deletions

View file

@@ -1474,6 +1474,27 @@ std::vector<llama_token> llama_tokenize(
return result;
}
// TODO: this function is hacky, needs to be improved
std::vector<llama_token> llama_tokenize_with_img(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special) {
static const std::string IMG_PLACEMENT = "<img_placement>";
std::vector<std::string> parts = string_split(text, IMG_PLACEMENT);
std::vector<llama_token> output;
for (const auto & part : parts) {
bool add_bos = &parts.front() == &part; // only the first part may get BOS
auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
output.insert(output.end(), tokens.begin(), tokens.end());
if (&parts.back() != &part) {
// add the image placeholder token between the two parts
output.push_back(TOKEN_IMG_PLACEMENT);
}
}
return output;
}
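For reference, a minimal usage sketch of the helper above (the prompt string and surrounding setup are illustrative, not part of this commit; it assumes a llama_context `ctx` created from a LLaVA-style model):

    // split on "<img_placement>" and tokenize each part; the placeholder itself
    // becomes the sentinel TOKEN_IMG_PLACEMENT rather than a real vocab token
    std::string prompt = "USER:<img_placement>\nwhat did you see?\nASSISTANT:";
    std::vector<llama_token> toks = llama_tokenize_with_img(ctx, prompt, /*add_special=*/true, /*parse_special=*/true);
    for (llama_token t : toks) {
        if (t == TOKEN_IMG_PLACEMENT) {
            // reserve positions here for the image embeddings before decoding
        }
    }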
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'

View file

@@ -378,6 +378,20 @@ static std::vector<T> string_split(const std::string & str, char delim) {
return values;
}
// split string by a `std::string delim` instead of `char delim`
static std::vector<std::string> string_split(std::string s, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t pos = 0;
std::string token;
while ((pos = s.find(delimiter)) != std::string::npos) {
token = s.substr(0, pos);
tokens.push_back(token);
s.erase(0, pos + delimiter.length());
}
tokens.push_back(s);
return tokens;
}
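A quick sanity check of this overload (a sketch; the expected values follow from the implementation above, including the empty part produced by a leading delimiter):

    std::vector<std::string> parts = string_split("a<img>b<img>c", "<img>");
    // parts == { "a", "b", "c" }
    parts = string_split("<img>tail", "<img>");
    // parts == { "", "tail" }   (an empty first part is kept)
    parts = string_split("no delimiter here", "<img>");
    // parts == { "no delimiter here" }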
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@@ -447,6 +461,17 @@ std::vector<llama_token> llama_tokenize(
bool add_special,
bool parse_special = false);
const llama_token TOKEN_IMG_PLACEMENT = -1000;
// tokenize with "placeholder" for image embedding tokens
// "<img_placement>" will be replaced with TOKEN_IMG_PLACEMENT
// TODO: this function is hacky, needs to be improved
std::vector<llama_token> llama_tokenize_with_img(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special = false);
// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(

View file

@@ -31,7 +31,7 @@ llama_img * load_image_from_file(const char * fname) {
// printf("\n");
// }
// printf("\n");
llama_img * result = llama_img_alloc(nx, ny);
llama_img * result = llama_img_init(nx, ny);
memcpy(result->data, img, nx*ny*3);
stbi_image_free(img);
return result;

View file

@@ -15,7 +15,9 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
gpt_params params;
params.prompt = "Hello my name is";
//params.prompt = "Hello my name is";
params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
"USER:<img_placement>\nwhat did you see?\nASSISTANT:";
params.n_predict = 32;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
@@ -62,52 +64,10 @@ int main(int argc, char ** argv) {
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
// TODO: this is for testing; DELETE ME
int n_cur = 0;
params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
{
llama_img_batch ibatch;
ibatch.n_imgs = 1;
ibatch.imgs = (llama_img **) malloc(1024);
ibatch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
llama_vision_encode(ctx, &ibatch);
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
int n_imgs = ibatch.n_imgs;
int n_embd = llama_n_embd(model);
int n_patches = llama_vision_n_patches(ctx);
printf("n_embd = %d ; n_patches = %d \n", n_embd, n_patches);
float * output_img = llama_vision_get_embeddings(ctx, 0);
n_cur += tokens.size();
llama_batch batch = llama_batch_init(512, 0, 1);
llama_batch_clear(batch);
for (auto t : tokens) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
if (llama_decode(ctx, batch) != 0) {
LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
// for (int k = 0; k < 10; k++) printf("%f\n", output_img[k]);
llama_batch_clear(batch);
batch = {int32_t(n_patches*n_imgs), nullptr, output_img, nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
if (llama_decode(ctx, batch) != 0) {
LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
n_cur += n_embd*n_imgs;
}
params.prompt = "\nwhat did you see?\nASSISTANT:";
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
@@ -127,25 +87,75 @@ int main(int argc, char ** argv) {
LOG("\n");
for (auto id : tokens_list) {
LOG("%s", llama_token_to_piece(ctx, id).c_str());
if (id == TOKEN_IMG_PLACEMENT) {
LOG("<img_placement>");
} else {
LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
}
LOG("\n\n");
// load image
llama_batch_img img_batch = llama_batch_img_init(1);
img_batch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
// create a llama_batch with size 512
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(512, 0, 1);
// evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); i++) {
//llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
if (i == 0) continue;
llama_batch_add(batch, tokens_list[i], n_cur, { 0 }, false);
n_cur++;
int n_cur = 0;
int i_img = 0;
for (auto id : tokens_list) {
if (id == TOKEN_IMG_PLACEMENT) {
img_batch.pos[i_img] = n_cur;
n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
i_img++;
} else {
llama_batch_add(batch, id, n_cur, { 0 }, false);
printf("pos %d tok %d --> %s\n", n_cur, id, llama_token_to_piece(ctx, id).c_str());
n_cur++;
}
}
// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;
if (llama_encode_vision(ctx, img_batch) != 0) {
LOG("%s: llama_encode_vision() failed\n", __func__);
return 1;
}
n_cur = 0;
{
auto t1 = ::llama_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
auto t2 = ::llama_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
t1.insert(t1.begin(), 1); // prepend BOS (token id 1)
n_cur = 0;
llama_batch_clear(batch);
llama_batch_add(batch, 1, 0, { 0 }, false);
llama_decode(ctx, batch);
n_cur = t1.size();
llama_batch_clear(batch);
llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, }; // 576 = hard-coded number of image embedding tokens for this test
llama_decode(ctx, batch0);
n_cur = 0;
llama_batch_clear(batch);
for (auto t : t1) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
llama_decode(ctx, batch);
n_cur = t1.size() + 576;
llama_batch_clear(batch);
printf("pos %d\n", n_cur);
for (auto t : t2) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
batch.logits[batch.n_tokens - 1] = true;
}
if (llama_decode(ctx, batch) != 0) {
LOG("%s: llama_decode() failed\n", __func__);
return 1;
@@ -153,7 +163,6 @@ int main(int argc, char ** argv) {
// main loop
//int n_cur = batch.n_tokens;
int n_decode = 0;
const auto t_main_start = ggml_time_us();

View file

@@ -233,10 +233,11 @@ extern "C" {
} llama_img;
// Input data for llama_vision_decode
typedef struct llama_img_batch {
typedef struct llama_batch_img {
int32_t n_imgs;
llama_img ** imgs;
} llama_img_batch;
llama_pos * pos;
} llama_batch_img;
// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
@@ -894,16 +895,18 @@ extern "C" {
//
// create new RGB image for input
LLAMA_API llama_img * llama_img_alloc(int width, int height);
LLAMA_API void llama_img_free(llama_img * img);
LLAMA_API llama_img * llama_img_init(int width, int height);
LLAMA_API void llama_img_free(llama_img * img);
// encode image into embeddings
LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch);
// get number of tokens that an image occupies, used to determine the position in the batch
LLAMA_API int32_t llama_img_n_tokens(struct llama_context * ctx, llama_img * img);
// get output embeddings, to be put into language batch
LLAMA_API float * llama_vision_get_embeddings(struct llama_context * ctx, int32_t idx);
// create new image batch
LLAMA_API llama_batch_img llama_batch_img_init(int n_imgs);
LLAMA_API void llama_batch_img_free(llama_batch_img batch);
LLAMA_API int32_t llama_vision_n_patches(struct llama_context * ctx);
// encode the input image batch
LLAMA_API int32_t llama_encode_vision(struct llama_context * ctx, llama_batch_img batch);
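A hedged sketch of the intended call sequence for this API (ctx/model setup omitted; load_image_from_file is the example helper shown earlier in this commit, and the file name and position are made up):

    llama_batch_img img_batch = llama_batch_img_init(1);
    img_batch.imgs[0] = load_image_from_file("image.jpg");
    img_batch.pos[0]  = 10; // sequence position where the image tokens will sit
    int32_t n_img_tokens = llama_img_n_tokens(ctx, img_batch.imgs[0]);
    if (llama_encode_vision(ctx, img_batch) != 0) {
        // handle error
    }
    // decode the surrounding text with llama_decode(), leaving positions
    // [10, 10 + n_img_tokens) for the image embeddings
    llama_img_free(img_batch.imgs[0]);
    llama_batch_img_free(img_batch);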
//
// Vocab
@@ -1237,6 +1240,7 @@ extern "C" {
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
LLAMA_API float * _test_get_img_embd(struct llama_context * ctx);
#ifdef __cplusplus
}

View file

@@ -78,16 +78,12 @@ int clip_n_patches(const clip_context & ctx) {
int clip_n_mmproj_embd(const clip_context & ctx) {
if (ctx.model->hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) {
return ctx.model->mm_b_b->ne[0];
return ctx.model->mm_2_b->ne[0];
} else {
GGML_ASSERT(false && "invalid proj type");
}
}
int clip_n_embd(const clip_context & ctx) {
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx);
}
/**
* Selects the best resolution from a list of possible resolutions based on the original size.
*
@@ -575,12 +571,12 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
embeddings = ggml_get_rows(ctx0, embeddings, patches);
if (hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_a_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_a_b);
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_b_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_b_b);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
} else {
GGML_ASSERT(false && "unsupported proj type");
}
@@ -681,7 +677,9 @@ static int32_t clip_image_batch_encode(clip_context & ctx, const clip_image_f32_
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(ctx.sched, embeddings);
// copy the embeddings to the location passed by the user
output.resize(clip_n_embd(ctx));
size_t out_nbytes = clip_n_patches(ctx)*clip_n_mmproj_embd(ctx)*sizeof(float);
GGML_ASSERT(out_nbytes == ggml_nbytes(embeddings));
output.resize(out_nbytes);
ggml_backend_tensor_get_async(backend_embd, embeddings, output.data(), 0, ggml_nbytes(embeddings));
ggml_backend_sched_synchronize(ctx.sched);
@@ -731,15 +729,18 @@ static int32_t encode_image_with_clip(clip_context & ctx, const llama_img img, s
////////////////////////////////////////////////////////////////////////////////////////
// public API
int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch) {
int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch) {
if (batch->n_imgs == 0) {
return 0;
}
// TODO: batching is not working atm, should be fixed later
const int n_embd = clip_n_embd(ctx);
ctx.output.resize(n_embd * batch->n_imgs);
ctx.n_output = batch->n_imgs;
const int n_embd = clip_n_mmproj_embd(ctx);
const int n_tokens_per_img = clip_n_patches(ctx);
const int n_pos = n_tokens_per_img*batch->n_imgs;
ctx.out_embd.resize(n_embd*n_pos);
ctx.out_pos.resize(n_pos);
for (int i = 0; i < batch->n_imgs; i++) {
std::vector<float> output_single;
@@ -748,14 +749,23 @@ int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch
return status;
}
// copy output embeddings to result
for (int k = 0; k < n_embd; k++) {
ctx.output[n_embd*i + k] = output_single[k];
for (int k = 0; k < n_embd*n_tokens_per_img; k++) {
ctx.out_embd[n_embd*n_tokens_per_img*i + k] = output_single[k];
}
// fill positions for all output tokens
for (int p = 0; p < n_tokens_per_img; p++) {
ctx.out_pos[n_tokens_per_img*i + p] = batch->pos[i] + p;
}
}
return 0;
}
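To make the buffer layout above concrete, a worked example with assumed LLaVA-1.5 sizes (clip_n_mmproj_embd = 4096, clip_n_patches = 576; neither value is asserted by this hunk):

    // two images placed at sequence positions 5 and 700:
    //   out_embd.size() == 4096 * 576 * 2   (image 0 fills the first 4096*576 floats)
    //   out_pos         == { 5, 6, ..., 580, 700, 701, ..., 1275 }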
void llama_vision_clear_output(clip_context & ctx) {
ctx.out_embd.clear();
ctx.out_pos.clear();
}
////////////////////////////////////////////////////////////////////////////////////////
// for debugging
#ifndef NDEBUG

View file

@@ -95,10 +95,10 @@ struct clip_vision_model {
struct ggml_tensor * projection = NULL;
// LLaVA projection
struct ggml_tensor * mm_a_w = NULL;
struct ggml_tensor * mm_a_b = NULL;
struct ggml_tensor * mm_b_w = NULL;
struct ggml_tensor * mm_b_b = NULL;
struct ggml_tensor * mm_1_w = NULL;
struct ggml_tensor * mm_1_b = NULL;
struct ggml_tensor * mm_2_w = NULL;
struct ggml_tensor * mm_2_b = NULL;
struct ggml_tensor * image_newline = NULL;
};
@@ -110,15 +110,15 @@ struct clip_context {
const clip_vision_model * model;
// temporary output data
int n_output;
std::vector<float> output; // size == n_output * n_embd
// temporary output data, to be picked up by llama_decode()
std::vector<float> out_embd; // size == n_tokens * n_embd
std::vector<llama_pos> out_pos; // position of each token
};
mm_patch_merge mm_patch_merge_from_name(std::string & name);
clip_projector_type projector_type_from_name(std::string & name);
int clip_n_patches(const clip_context & ctx);
int clip_n_mmproj_embd(const clip_context & ctx);
int clip_n_embd(const clip_context & ctx);
int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch);
int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch);
void llama_vision_clear_output(clip_context & ctx);

View file

@@ -1600,7 +1600,7 @@ static const std::map<vision_arch, std::map<vision_tensor, std::string>> VISION_
{
VISION_ARCH_LLAVA,
{
{ VISION_TENSOR_MMPROJ, "v.mmproj" },
{ VISION_TENSOR_MMPROJ, "v.mmproj_%d" },
{ VISION_TENSOR_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" },
@@ -8990,10 +8990,10 @@ static bool llm_load_tensors(
switch (vparams.arch) {
case VISION_ARCH_LLAVA:
{
model.clip.mm_a_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff});
model.clip.mm_a_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff});
model.clip.mm_b_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff});
model.clip.mm_b_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff});
model.clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff});
model.clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff});
model.clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff});
model.clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff});
model.clip.class_embedding = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS ), {n_embd});
model.clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_embd});
@@ -9001,8 +9001,8 @@ static bool llm_load_tensors(
model.clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd});
model.clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_embd});
// model.clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd});
// model.clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd});
model.clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
model.clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
@@ -20110,6 +20110,8 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
void llama_kv_cache_clear(struct llama_context * ctx) {
llama_kv_cache_clear(ctx->kv_self);
// clear vision embeddings output
llama_vision_clear_output(ctx->clip);
}
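A short sketch of the contract this adds (names from this commit; error handling omitted): clearing the KV cache now also drops any pending image embeddings, so they must be re-encoded before the next decode.

    llama_encode_vision(ctx, img_batch);   // fills the pending out_embd / out_pos buffers
    llama_kv_cache_clear(ctx);             // now also calls llama_vision_clear_output()
    // the pending image embeddings are gone; call llama_encode_vision() again if needed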
bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
@@ -21226,9 +21228,48 @@ int32_t llama_encode(
return ret;
}
float * _test_get_img_embd(struct llama_context * ctx) { return ctx->clip.out_embd.data(); }
int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch) {
// hacky vision implementation, for testing only
if (!ctx->clip.out_embd.empty()) {
// int8_t * logits = new int8_t [ctx->clip.out_pos.size()];
// int32_t * n_seq_id = new int32_t [ctx->clip.out_pos.size()];
// llama_seq_id ** seq_id = new llama_seq_id *[ctx->clip.out_pos.size()];
// llama_seq_id seq_id_0 = 0;
// printf("out_pos %d\n", ctx->clip.out_pos.size());
// llama_batch ibatch = {
// /*n_tokens =*/ static_cast<int32_t>(ctx->clip.out_pos.size()),
// /*tokens =*/ nullptr,
// /*embd =*/ ctx->clip.out_embd.data(),
// /*pos =*/ ctx->clip.out_pos.data(),
// /*n_seq_id =*/ n_seq_id,
// /*seq_id =*/ seq_id,
// /*logits =*/ logits,
// /*all_pos_0 =*/ 0,
// /*all_pos_1 =*/ 0,
// /*all_seq_id =*/ 0,
// };
// for (size_t i = 0; i < ctx->clip.out_pos.size(); i++) {
// ibatch.n_seq_id[i] = 1;
// ibatch.seq_id [i] = &seq_id_0;
// ibatch.logits [i] = 0;
// }
// llama_decode_internal(*ctx, ibatch);
// delete[] logits;
// delete[] n_seq_id;
// delete[] seq_id;
// llama_vision_clear_output(ctx->clip);
//int n_eval = ctx->clip.out_pos.size();
//int n_past = ctx->clip.out_pos[0];
//printf("n_eval %d, n_past %d\n", n_eval, n_past);
//llama_batch ibatch = {int32_t(n_eval), nullptr, ctx->clip.out_embd.data(), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
//llama_decode_internal(*ctx, ibatch);
//llama_vision_clear_output(ctx->clip);
}
const int ret = llama_decode_internal(*ctx, batch);
if (ret < 0) {
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
@@ -21808,30 +21849,45 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
// vision
//
llama_img * llama_img_alloc(int width, int height) {
llama_img * llama_img_init(int width, int height) {
llama_img * img = new llama_img();
img->nx = width;
img->ny = height;
img->data = (unsigned char *)malloc(width*height*3);
if (width > 0 && height > 0) {
img->data = (unsigned char *)malloc(width*height*3);
}
return img;
}
void llama_img_free(llama_img * img) {
free(img->data);
if (img->data) free(img->data);
delete img;
}
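A minimal allocation round-trip with the renamed helpers (a sketch; the packed-RGB size width*height*3 matches the memcpy in the image loader earlier in this commit, and `pixels` is an assumed buffer of decoded RGB data):

    llama_img * img = llama_img_init(336, 336);
    // img->data now has room for 336*336*3 bytes of packed RGB
    memcpy(img->data, pixels, 336*336*3);
    llama_img_free(img);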
int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch) {
return llama_vision_encode_internal(ctx->clip, batch);
}
float * llama_vision_get_embeddings(struct llama_context * ctx, int32_t idx) {
return ctx->clip.output.data();
}
int32_t llama_vision_n_patches(struct llama_context * ctx) {
int32_t llama_img_n_tokens(struct llama_context * ctx, llama_img * img) {
GGML_UNUSED(img); // reserved for future usage
return clip_n_patches(ctx->clip);
}
llama_batch_img llama_batch_img_init(int n_imgs) {
llama_batch_img batch;
batch.n_imgs = n_imgs;
if (n_imgs > 0) {
batch.imgs = (llama_img **)malloc(n_imgs*sizeof(llama_img *));
batch.pos = (llama_pos * )malloc(n_imgs*sizeof(llama_pos ));
}
return batch;
}
void llama_batch_img_free(llama_batch_img batch) {
if (batch.imgs) free(batch.imgs);
if (batch.pos ) free(batch.pos );
}
int32_t llama_encode_vision(struct llama_context * ctx, llama_batch_img batch) {
return llama_encode_vision_internal(ctx->clip, &batch);
}
//
// model split
//