diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index 974f10edd..68805b236 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -19,12 +19,21 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
     int N = (int) tokens.size();
+    std::vector<llama_pos> pos;
     for (int i = 0; i < N; i += n_batch) {
         int n_eval = (int) tokens.size() - i;
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+        auto batch = llama_batch_get_one(&tokens[i], n_eval, *n_past, 0);
+        // TODO: add mrope pos ids somewhere else
+        pos.resize(batch.n_tokens * 3);
+        for (int j = 0; j < batch.n_tokens * 3; j++) {
+            pos[j] = j % batch.n_tokens;
+        }
+        batch.pos = pos.data();
+
+        if (llama_decode(ctx_llama, batch)) {
             LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
         *n_past += n_eval;
     }
@@ -296,9 +305,12 @@ static void tmp_test_conv2d_reshape(struct llava_context * ctx_llava, gpt_params
     ggml_set_input(inp_raw);
 
     auto image_pixels = batch_size * image_size_width * image_size_height * 3;
+    auto one_ch = image_size_width * image_size_height;
     std::vector<float> dummy_img;
     dummy_img.resize(image_pixels);
-    std::fill(dummy_img.begin(), dummy_img.end(), 0.1);
+    // give each RGB channel a distinct constant so the channel order is visible in the dump
+    std::fill(dummy_img.begin(), dummy_img.begin() + one_ch, 0.1);
+    std::fill(dummy_img.begin() + one_ch, dummy_img.begin() + one_ch * 2, 0.2);
+    std::fill(dummy_img.begin() + one_ch * 2, dummy_img.end(), 0.3);
     memcpy(inp_raw->data, dummy_img.data(), image_pixels * ggml_element_size(inp_raw));
 
     int patch_size = 14;
@@ -343,6 +355,105 @@ static void tmp_test_conv2d_reshape(struct llava_context * ctx_llava, gpt_params
         (float *) ggml_get_data(inp),
         sizeof(float) * num_patches * hidden_size * batch_size);
     ggml_free(ctx0);
+
+    std::ofstream outFile("conv2d.bin", std::ios::binary);
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<const char *>(embd.data()), embd.size() * sizeof(float));
+        outFile.close();
+        std::cout << "Data successfully written to conv2d.bin" << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+}
+
+
+static void tmp_test_4d_reshape(struct llava_context * ctx_llava, gpt_params * params) {
+    int image_size_width = 32;
+    int image_size_height = 32;
+    int batch_size = 1;
+
+    static size_t buf_size = 512u*1024*1024;
+    static void * buf = malloc(buf_size);
+
+    struct ggml_init_params init_params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(init_params);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(
+        ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 8, batch_size);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    auto image_pixels = batch_size * image_size_width * image_size_height * 8;
+    auto one_ch = image_size_width * image_size_height;
+    std::vector<float> dummy_img;
+    dummy_img.resize(image_pixels);
+    for (int i = 0; i < 8; i++) {
+        // fill channel i with a gradient so every (channel, y, x) gets a unique value
+        // (earlier attempt: std::fill each channel with the constant 0.1 * i)
+        for (int y = 0; y < image_size_height; y++) {
+            for (int x = 0; x < image_size_width; x++) {
+                dummy_img[one_ch * i + image_size_width * y + x] = i * (image_size_width * y + x) / (float)(32 * 32);
+            }
+        }
+    }
+    memcpy(inp_raw->data, dummy_img.data(), image_pixels * ggml_element_size(inp_raw));
+
+    int patch_size = 1;
+    int hidden_size = 8;
+    int patch_w = image_size_width / patch_size;
+    int patch_h = image_size_height / patch_size;
+    int num_patches = (image_size_width / patch_size) * (image_size_height / patch_size);
+
+    // earlier experiments, kept for reference:
+    // inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
+    // inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); // swap axis 0 & 1, ignore axis 3 which is empty in this tensor
+    // auto inp = ggml_cont(ctx0, ggml_permute(ctx0, inp_raw, 2, 0, 1, 3));
+
+    // [w, h, c, b] -> [c, w, h, b]; ggml_permute moves src dim i to dst dim arg_i: (0->1), (1->2), (2->0), (3->3)
+    auto inp = ggml_cont(ctx0, ggml_permute(ctx0, inp_raw, 1, 2, 0, 3));
+    inp = ggml_reshape_4d(
+        ctx0, inp,
+        hidden_size * 2, patch_w / 2, patch_h, batch_size);
+    inp = ggml_reshape_4d(
+        ctx0, inp,
+        hidden_size * 2, patch_w / 2, 2, batch_size * (patch_h / 2));
+    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+    inp = ggml_reshape_2d(
+        ctx0, inp,
+        hidden_size * 4, (patch_w / 2) * batch_size * (patch_h / 2));
+
+    ggml_build_forward_expand(gf, inp);
+    ggml_graph_compute_with_ctx(ctx0, gf, 2);
+
+    std::vector<float> embd;
+    embd.resize(num_patches * hidden_size * batch_size);
+    memcpy(
+        embd.data(),
+        (float *) ggml_get_data(inp),
+        sizeof(float) * num_patches * hidden_size * batch_size);
+    ggml_free(ctx0);
+
+    std::ofstream outFile("reshape_4d.bin", std::ios::binary);
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<const char *>(embd.data()), embd.size() * sizeof(float));
+        outFile.close();
+        std::cout << "Data successfully written to reshape_4d.bin" << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
 }
@@ -582,11 +693,11 @@ int main(int argc, char ** argv) {
     auto ctx_llava = llava_init_context(&params, model);
 
     // process the prompt
-    // tmp_test_conv2d_reshape(ctx_llava, &params);
+    tmp_test_4d_reshape(ctx_llava, &params);
     // tmp_test_rope(ctx_llava, &params);
     // tmp_test_mrope(ctx_llava, &params);
-    tmp_test_mrope_2d(ctx_llava, &params);
-    process_prompt(ctx_llava, nullptr, &params, params.prompt);
+    // tmp_test_mrope_2d(ctx_llava, &params);
+    // process_prompt(ctx_llava, nullptr, &params, params.prompt);
 
     llama_perf_context_print(ctx_llava->ctx_llama);
     ctx_llava->model = NULL;
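
Note on the eval_tokens() hunk: llama_batch_get_one() fills batch.pos with a single linear position per token, while Qwen2-VL's M-RoPE rotates each head with three position components (temporal, height, width). The hunk therefore overrides batch.pos with a buffer of n_tokens * 3 entries in stream-major order, so component s of token t lives at pos[s * n_tokens + t]; pos[j] = j % batch.n_tokens gives every component the same 0..n_tokens-1 ramp, which is right for text-only tokens except that the *n_past offset is still missing (hence the TODO). A minimal standalone sketch of that layout, with the n_past offset applied; build_mrope_text_pos is a hypothetical helper, not part of the patch:

#include <cstdint>
#include <cstdio>
#include <vector>

using llama_pos = int32_t; // same underlying type as in llama.h

static std::vector<llama_pos> build_mrope_text_pos(int n_tokens, int n_past) {
    // stream-major layout: pos[s * n_tokens + t] is component s of token t;
    // for plain text, all three M-RoPE components share the same linear position
    std::vector<llama_pos> pos(n_tokens * 3);
    for (int s = 0; s < 3; s++) {
        for (int t = 0; t < n_tokens; t++) {
            pos[s * n_tokens + t] = n_past + t; // the patch still TODOs this offset
        }
    }
    return pos;
}

int main() {
    auto pos = build_mrope_text_pos(/*n_tokens=*/4, /*n_past=*/10);
    for (int s = 0; s < 3; s++) {
        for (int t = 0; t < 4; t++) {
            printf("%d ", (int) pos[s * 4 + t]);
        }
        printf("\n"); // prints "10 11 12 13" once per component
    }
    return 0;
}

Image tokens would additionally need per-patch row/column indices in the height and width streams, which is presumably the "somewhere else" the TODO refers to.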
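Note on tmp_test_4d_reshape(): with patch_size = 1 and hidden_size = 8, the permute/reshape chain implements a 2x2 spatial patch merge, the same kind of grouping Qwen2-VL's patch merger applies before its MLP projector. The [w, h, c, b] input ends up as a [4 * hidden_size, (w/2) * (h/2)] matrix where row (gy * w/2 + gx) holds the merged 2x2 block at grid cell (gx, gy), sub-patches ordered top-left, top-right, bottom-left, bottom-right, channels fastest. A pure-C++ reference with the same gradient fill, useful for diffing against reshape_4d.bin; this is a sketch of the intended output order, not code from the patch:

#include <cstdio>
#include <vector>

int main() {
    const int W = 32, H = 32, C = 8;
    std::vector<float> img(W * H * C);
    for (int c = 0; c < C; c++)                 // same gradient fill as the test
        for (int y = 0; y < H; y++)
            for (int x = 0; x < W; x++)
                img[c * W * H + y * W + x] = c * (y * W + x) / (float)(32 * 32);

    const int GW = W / 2, GH = H / 2;           // merged-patch grid
    std::vector<float> out(GW * GH * 4 * C);
    for (int gy = 0; gy < GH; gy++)
        for (int gx = 0; gx < GW; gx++)
            for (int dy = 0; dy < 2; dy++)      // dy, dx walk the 2x2 block
                for (int dx = 0; dx < 2; dx++)
                    for (int c = 0; c < C; c++) {
                        int row = gy * GW + gx;
                        int col = (dy * 2 + dx) * C + c; // TL, TR, BL, BR; channels fastest
                        int x = gx * 2 + dx, y = gy * 2 + dy;
                        out[row * 4 * C + col] = img[c * W * H + y * W + x];
                    }

    printf("first merged patch:\n");
    for (int k = 0; k < 4 * C; k++) printf("%.4f ", out[k]);
    printf("\n");
    return 0;
}

Both tmp_test_* dump paths also assume <fstream> and <iostream> are already reachable from qwen2vl-cli.cpp; if they are not, the patch needs those two includes as well.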