From e02c45c63b91bac52397d8264030413f99721cb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Tue, 24 Sep 2024 22:24:53 +0200
Subject: [PATCH 1/8] examples: add compression example

---
 common/arg.cpp                   |  20 +-
 common/common.h                  |   4 +
 examples/CMakeLists.txt          |   1 +
 examples/compress/CMakeLists.txt |   5 +
 examples/compress/README.md      |   3 +
 examples/compress/compress.cpp   | 643 +++++++++++++++++++++++++++++++
 6 files changed, 675 insertions(+), 1 deletion(-)
 create mode 100644 examples/compress/CMakeLists.txt
 create mode 100644 examples/compress/README.md
 create mode 100644 examples/compress/compress.cpp

diff --git a/common/arg.cpp b/common/arg.cpp
index c1ec3c4f9..b1b9f4792 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1633,7 +1633,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.cvector_outfile = value;
             params.lora_outfile = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_COMPRESS}));
     add_opt(llama_arg(
         {"-ofreq", "--output-frequency"}, "N",
         format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -1938,6 +1938,24 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else { std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(llama_arg(
+        {"--compression_header_size"}, "N",
+        "Number of tokens to keep in header (default: 1)",
+        [](gpt_params & params, int value){
+            params.num_tokens_header = value;
+        }).set_examples({LLAMA_EXAMPLE_COMPRESS}));
+    add_opt(llama_arg(
+        {"--mode"}, "{compress,expand,test}",
+        "What task to run (default: test)",
+        [](gpt_params & params, const std::string & value){
+            if (value == "test"){
+                return; }
+            else if (value == "compress"){
+                params.compress_mode = 1; }
+            else if (value == "expand"){
+                params.compress_mode = 2; }
+            else { throw std::invalid_argument("invalid value"); }
+        }).set_examples({LLAMA_EXAMPLE_COMPRESS}));
     add_opt(llama_arg(
         {"--log-disable"},
         "Log disable",
diff --git a/common/common.h b/common/common.h
index cb87c4479..97a2d0919 100644
--- a/common/common.h
+++ b/common/common.h
@@ -80,6 +80,7 @@ enum llama_example {
     LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_COMPRESS,
 
     LLAMA_EXAMPLE_COUNT,
 };
 
 enum gpt_sampler_type {
@@ -340,6 +341,9 @@ struct gpt_params {
 
     // batched-bench params
     bool batched_bench_output_jsonl = false;
+
+    int num_tokens_header = 1;
+    int compress_mode = 0;
 };
 
 // call once at the start of a program if it uses libcommon
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 67b3d2774..acc0edccf 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -17,6 +17,7 @@ else()
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
     add_subdirectory(benchmark)
+    add_subdirectory(compress)
    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
diff --git a/examples/compress/CMakeLists.txt b/examples/compress/CMakeLists.txt
new file mode 100644
index 000000000..677dac8fd
--- /dev/null
+++ b/examples/compress/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-compress)
+add_executable(${TARGET} compress.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/compress/README.md b/examples/compress/README.md
new file mode 100644
index 000000000..3b461e143
--- /dev/null
+++ b/examples/compress/README.md
@@ -0,0 +1,3 @@
+# llama.cpp/examples/compress
+
+Demonstration of LLM-based natural language compression.
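+
+A rough usage sketch (flag names as wired up in this example; the model path
+and file names are illustrative):
+
+```sh
+# compress: read text from -f, write the packed stream to -o
+llama-compress -m model.gguf -f input.txt --mode compress -o compressed.bin
+
+# expand: read the packed stream from -f, write the decoded text to -o
+llama-compress -m model.gguf -f compressed.bin --mode expand -o output.txt
+
+# test (default): round-trip the prompt in one process and print statistics
+llama-compress -m model.gguf -f input.txt
+```
+
+Compression and expansion must use the same model and sampling configuration:
+decoding works by reproducing the exact candidate ranking seen while encoding.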
diff --git a/examples/compress/compress.cpp b/examples/compress/compress.cpp
new file mode 100644
index 000000000..e27ab2b24
--- /dev/null
+++ b/examples/compress/compress.cpp
@@ -0,0 +1,643 @@
+#include "arg.h"
+#include "common.h"
+#include "sampling.h"
+#include "sampling.cpp"
+#include "log.h"
+#include "llama.h"
+
+#include <cassert>
+#include <cstdio>
+#include <string>
+#include <vector>
+#include <bitset>
+#include <fstream>
+
+int msb_log2(int x)
+{
+    int ret = 0;
+    while (x > 0)
+    {
+        ret++;
+        x >>= 1;
+    }
+    return ret;
+}
+
+int msB_log256(int x)
+{
+    int ret = 0;
+    while (x > 0)
+    {
+        ret++;
+        x >>= 8;
+    }
+    return ret;
+}
+
+const int block_header_size = 2;
+const int fixed_token_cost = 1;
+
+std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gpt_sampler *smpl, int num_raw_tokens_header)
+{
+
+    llama_batch batch = llama_batch_init(inp.size(), 0, 1);
+
+    for (size_t i = 0; i < num_raw_tokens_header; i++)
+    {
+        llama_batch_add(batch, inp[i], i, {0}, true);
+    }
+
+    // eval the first few tokens of the prompt
+    if (llama_decode(ctx, batch))
+    {
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
+        exit(1);
+    }
+
+    const auto t_enc_end = ggml_time_us();
+
+    std::vector<int> sample_ids;
+
+    smpl->set_logits(ctx, num_raw_tokens_header - 1);
+    for (int index = num_raw_tokens_header; index < inp.size(); index++)
+    {
+        auto &cur_p = smpl->cur_p; // initialized by set_logits
+        // llama_sampler_apply(smpl->grmr, &cur_p);
+        llama_sampler_apply(smpl->chain, &cur_p);
+
+        int match = -1;
+        for (int i = 0; i < cur_p.size; i++)
+        {
+            auto tok = cur_p.data[i];
+            llama_token candidate = tok.id;
+            if (candidate == inp[index])
+            {
+                LOG("%s", llama_token_to_piece(ctx, candidate));
+                match = i;
+                break;
+            }
+        }
+        if(match<0){
+            LOG_ERR("\n couldn't match %s", llama_token_to_piece(ctx, inp[index]));
+            exit(1);
+        }
+        sample_ids.push_back(match);
+        llama_batch_clear(batch);
+        llama_batch_add(batch, inp[index], index, {0}, true);
+        if (llama_decode(ctx, batch))
+        {
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+            exit(1);
+        }
+        smpl->set_logits(ctx, 0);
+    }
+
+
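+    // Layout of the packed stream, as implemented below (each "sample id" is
+    // the rank of the original token in the sampler's sorted candidate list):
+    //   byte 0           : number of raw header tokens N
+    //   bytes 1 .. 4*N   : the raw header tokens, 4 bytes each, little endian
+    //   then a sequence of:
+    //     blocks : marker 0b1010PPPP (PPPP = pad bits), one block-size byte,
+    //              then unary-coded ranks (rank r = r one-bits ended by a zero)
+    //     tokens : marker 0b0101SSSS (SSSS = payload size in bytes), then the
+    //              rank as a little-endian integer; used for large ranks where
+    //              the unary run would cost more bits than this escape
+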
+    // bit pack sample_ids
+    std::vector<uint8_t> sample_ids_bitpacked;
+    int bit_offset = 0;
+    uint8_t current = 0;
+
+    int block_start = 0;
+    bool build_block = true;
+    bool was_block = false;
+
+    // first put the raw first few tokens
+    sample_ids_bitpacked.push_back(num_raw_tokens_header);
+    for (size_t i = 0; i < num_raw_tokens_header; i++)
+    {
+        // pack 4 bytes
+        for (int j = 0; j < 4; j++)
+        {
+            uint8_t byte = inp[i] >> (j * 8);
+            sample_ids_bitpacked.push_back(byte);
+        }
+    }
+    block_start = 1 + num_raw_tokens_header * 4;
+    bit_offset = block_start * 8;
+
+    for (int i = 0; i < sample_ids.size(); i++)
+    {
+        int sample_id = sample_ids[i];
+        uint8_t PAD = (8 - bit_offset % 8) % 8;
+        uint8_t bytesize = (uint8_t)msB_log256(sample_id);
+        // LOG("pos: %d, bs: %d\n",sample_id, bytesize);
+
+        // Big number, better save as token
+        if (sample_id > PAD + (block_header_size + fixed_token_cost + bytesize) * 8)
+        {
+            // LOG("End block\n");
+            // Close current block (0b1010 is block marker)
+            if (was_block)
+            {
+                sample_ids_bitpacked[block_start] = 0b10100000 | PAD;
+                int block_size = (bit_offset + PAD) / 8 - block_start;
+                if (block_size >= 256)
+                {
+                    // TODO: figure it out
+                    LOG_ERR("OOPS");
+                    exit(-1);
+                }
+                sample_ids_bitpacked[block_start + 1] = block_size & 0xff;
+
+                // TODO: handle more than 256 bits of block data (multiple blocks or bigger header?)
+                // sample_ids_bitpacked[block_start + 2] = block_size >> 8;
+
+                // put last bytes
+                if (PAD)
+                {
+                    sample_ids_bitpacked.push_back(current);
+                    current = 0;
+                }
+            }
+            bit_offset += PAD;
+            if (bit_offset % 8)
+            {
+                LOG_ERR("Unreachable");
+                exit(-1);
+            }
+            // LOG("\n%d",bit_offset/8);
+            // 0b0101 is token marker
+
+            sample_ids_bitpacked.push_back(0b01010000 | bytesize);
+            // put token bytes into sample_ids_bitpacked
+            // LOG("\n%d -> ",sample_id);
+            for (int j = 0; j < bytesize; j++)
+            {
+                sample_ids_bitpacked.push_back(sample_id & 0xff);
+                LOG("%02x ", sample_id & 0xff);
+                sample_id >>= 8;
+            }
+            if (sample_id)
+                LOG("Shouldn't happen");
+            bit_offset += 8 * (fixed_token_cost + bytesize);
+            build_block = true;
+            was_block = false;
+            continue;
+        }
+        was_block = true;
+        if (build_block)
+        {
+            if (bit_offset % 8)
+            {
+                LOG_ERR("Unreachable");
+                exit(-1);
+            }
+            build_block = false;
+            block_start = bit_offset / 8;
+            for (int j = 0; j < block_header_size; j++)
+            {
+                sample_ids_bitpacked.push_back(0);
+            }
+            bit_offset += 8 * block_header_size;
+        }
+        for (int j = 0; j < sample_id; j++)
+        {
+            current |= 1 << (7 - bit_offset % 8);
+            bit_offset++;
+            if (bit_offset % 8 == 0)
+            {
+                sample_ids_bitpacked.push_back(current);
+                current = 0;
+            }
+        }
+        bit_offset++;
+        if (bit_offset % 8 == 0)
+        {
+            sample_ids_bitpacked.push_back(current);
+            current = 0;
+        }
+    }
+    if (!build_block)
+    {
+        if (bit_offset % 8)
+            sample_ids_bitpacked.push_back(current);
+        uint8_t PAD = (8 - bit_offset % 8) % 8;
+        sample_ids_bitpacked[block_start] = 0b10100000 | PAD;
+        int block_size = (bit_offset + PAD) / 8 - block_start;
+        // endianness: big endian
+        sample_ids_bitpacked[block_start + 1] = block_size & 0xff;
+    }
+    llama_batch_free(batch);
+    return sample_ids_bitpacked;
+}
+
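+// Reverse of encode(): re-run the same model over the growing output, read
+// each rank back from the bit stream and emit the candidate at that rank.
+// If the original tokens are passed in `inp`, mismatches are reported.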
+std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vector<uint8_t> sample_ids_bitpacked, std::vector<llama_token> inp = {})
+{
+    std::vector<llama_token> out;
+
+    llama_batch batch = llama_batch_init(512, 0, 1);
+
+    int num_raw_tokens_header = sample_ids_bitpacked[0];
+
+    for (size_t i = 0; i < num_raw_tokens_header; i++)
+    {
+        // unpack 4 bytes
+        llama_token token = 0;
+        for (int j = 3; j >= 0; j--)
+        {
+            token <<= 8;
+            token |= sample_ids_bitpacked[1 + i * 4 + j];
+        }
+
+        llama_batch_add(batch, token, i, {0}, true);
+        out.push_back(token);
+        auto token_str = llama_token_to_piece(ctx, token);
+        LOG("%s", token_str.c_str());
+    }
+    LOG("\u001b[0m\u001b[37m");
+    if (llama_decode(ctx, batch))
+    {
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
+        exit(1);
+    }
+
+    smpl->set_logits(ctx, num_raw_tokens_header - 1);
+
+    int index = 0;
+    int bit_index = (1 + num_raw_tokens_header * 4) * 8;
+    const int bitsize = sample_ids_bitpacked.size() * 8;
+    while (bit_index < bitsize)
+    {
+
+        uint8_t header = sample_ids_bitpacked[bit_index / 8];
+        if (header & 0b01010000)
+        {
+            uint8_t bytesize = header & 0x0f;
+            // it's a token
+
+            int sample_id = 0;
+            for (int i = bytesize; i > 0; i--)
+            {
+                sample_id <<= 8;
+                sample_id |= (int)sample_ids_bitpacked[i + (bit_index / 8)];
+            }
+
+            auto &cur_p = smpl->cur_p; // initialized by set_logits
+            llama_sampler_apply(smpl->chain, &cur_p);
+            auto token_id = cur_p.data[sample_id].id;
+
+            out.push_back(token_id);
+
+            if (!inp.size() || token_id == inp[num_raw_tokens_header + index])
+            {
+                LOG("%s", llama_token_to_piece(ctx, token_id).c_str());
+            }
+            else
+            {
+                // print in red
+                LOG("\u001b[31m%s", llama_token_to_piece(ctx, token_id).c_str());
+                LOG("\nExpected: %s", llama_token_to_piece(ctx, inp[num_raw_tokens_header + index]).c_str());
+                // LOG("\n%d", num_raw_tokens_header + index);
+                LOG("\n, Id: %d != %d", token_id, inp[num_raw_tokens_header + index]);
+                LOG("\nPos: %d, bs:%d", sample_id, bytesize);
+
+                // print sample_id bytes in hex
+                // LOG("\n %02x %02x", sample_ids_bitpacked[bit_index / 8], sample_ids_bitpacked[bit_index / 8 + 1]);
+                LOG("\n");
+                for (int i = bytesize; i > 0; i--)
+                {
+                    LOG("%02x ", sample_ids_bitpacked[i + (bit_index / 8)]);
+                }
+                exit(-1);
+            }
+
+            llama_batch_clear(batch);
+            llama_batch_add(batch, token_id, num_raw_tokens_header + index, {0}, true);
+            if (llama_decode(ctx, batch))
+            {
+                LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+                exit(1);
+            }
+            smpl->set_logits(ctx, 0);
+            index++;
+
+            bit_index += 8 * (fixed_token_cost + bytesize);
+        }
+        else
+        {
+            // it's a block
+            uint8_t PAD = header & 0x0f;
+            int block_size = sample_ids_bitpacked[bit_index / 8 + 1];
+            int block_end = block_size * 8 + bit_index;
+            bit_index += 8 * block_header_size;
+            int id = 0;
+            for (; bit_index < block_end - PAD; bit_index++)
+            {
+                bool bit = sample_ids_bitpacked[bit_index / 8] & (1 << (7 - bit_index % 8));
+                if (bit)
+                {
+                    id++;
+                }
+                else
+                {
+                    {
+                        int sample_id = id;
+
+                        auto &cur_p = smpl->cur_p; // initialized by set_logits
+                        // llama_sampler_apply(smpl->grmr, &cur_p);
+                        llama_sampler_apply(smpl->chain, &cur_p);
+                        auto token_id = cur_p.data[sample_id].id;
+                        out.push_back(token_id);
+                        if (!inp.size() || token_id == inp[num_raw_tokens_header + index])
+                        {
+                            LOG("%s", llama_token_to_piece(ctx, token_id).c_str());
+                        }
+                        else
+                        {
+                            // print in red
+                            LOG("\u001b[31m%s", llama_token_to_piece(ctx, token_id).c_str());
+                        }
+
+                        llama_batch_clear(batch);
+                        llama_batch_add(batch, token_id, num_raw_tokens_header + index, {0}, true);
+                        if (llama_decode(ctx, batch))
+                        {
+                            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+                            exit(1);
+                        }
+                        smpl->set_logits(ctx, 0);
+                    }
+                    index++;
+
+                    id = 0;
+                }
+            }
+            // LOG("\n(%d+%d)/8= %d\n",bit_index,PAD,(bit_index+PAD)/8);
+            bit_index += PAD;
+        }
+    }
+
+    llama_batch_free(batch);
+    return out;
+}
+
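+// Round-trip params.prompt through encode() and decode() in one process,
+// printing the packed stream and compression statistics in between.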
+void test(const gpt_params &params)
+{
+    int num_raw_tokens_header = params.num_tokens_header;
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model *model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context *ctx = llama_new_context_with_model(model, ctx_params);
+
+    // Tokenize the prompt
+    std::vector<llama_token> inp;
+
+    inp = ::llama_tokenize(ctx, params.prompt, false, false);
+
+    // num_raw_tokens_header = inp.size();
+    assert(inp.size() > num_raw_tokens_header);
+
+    const int max_context_size = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;
+
+    if ((int)inp.size() > max_tokens_list_size)
+    {
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int)inp.size(), max_tokens_list_size);
+        exit(1);
+    }
+
+    LOG("\n\n");
+
+    int i = 0;
+    for (auto id : inp)
+    {
+        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        if (++i >= num_raw_tokens_header)
+            break;
+    }
+
+    fflush(stderr);
+
+    // encode stage
+
+    const auto t_enc_start = ggml_time_us();
+
+    struct gpt_sampler *smpl = gpt_sampler_init(model, params.sparams);
+
+    std::vector<uint8_t> sample_ids_bitpacked = encode(ctx, inp, smpl, num_raw_tokens_header);
+
+    gpt_sampler_free(smpl);
+    auto t_enc_end = ggml_time_us();
+
+    LOG("\n");
+
+    // print bits as binary to debug
+    for (int i = 0; i < sample_ids_bitpacked.size(); i++)
+    {
+        std::bitset<8> x(sample_ids_bitpacked[i]);
+        LOG("%s ", x.to_string().c_str());
+    }
+    LOG("\n");
+
+    // print as hexadecimal
+    for (int i = 0; i < sample_ids_bitpacked.size(); i++)
+    {
+        LOG("%02X ", sample_ids_bitpacked[i]);
+    }
+    LOG("\n");
+
+    LOG("\nInput: %d characters (%d tokens)", params.prompt.length(), inp.size());
+
+    float compressed_byte_per_token = (float)sample_ids_bitpacked.size() / (float)inp.size();
+    float compressed_bits_per_char = 8 * (float)sample_ids_bitpacked.size() / (float)params.prompt.length();
+
+    LOG("\n%d compressed bytes,(%04f bytes per token, %04f bits per character)\n", (int)sample_ids_bitpacked.size(), compressed_byte_per_token, compressed_bits_per_char);
+
+    llama_free(ctx);
+    ctx = llama_new_context_with_model(model, ctx_params);
+
+    LOG("\n------------\n");
+
+    // decode stage
+
+    const auto t_dec_start = ggml_time_us();
+
+    smpl = gpt_sampler_init(model, params.sparams);
+    decode(ctx, smpl, sample_ids_bitpacked, inp);
+
+    auto t_dec_end = ggml_time_us();
+
+    LOG("\n\n");
+
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", inp.size(), (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+
+    LOG_INF("\n");
+    LOG_INF("\n");
+
+    LOG_INF("\n");
+    gpt_perf_print(ctx, smpl);
+
+    gpt_sampler_free(smpl);
+
+    llama_free(ctx);
+    llama_free_model(model);
+}
+
+int main(int argc, char **argv)
+{
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPRESS))
+    {
+        return 1;
+    }
+
+    // TODO: change defaults instead?
+    params.sparams.min_p = 0;
+    params.sparams.top_p = 1;
+    params.sparams.top_k = -1;
+    params.sparams.temp = 0;
+
+    gpt_init();
+
+    // init llama.cpp
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // TODO: use Enum?
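+    // compress_mode: 0 = test (round-trip in one process),
+    //                1 = compress params.prompt into out_file,
+    //                2 = expand prompt_file back into text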
+    if (params.compress_mode == 0)
+    {
+        test(params);
+    }
+    else if (params.compress_mode == 1)
+    { // compress
+        llama_model_params model_params = llama_model_params_from_gpt_params(params);
+        llama_model *model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+        llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+        llama_context *ctx = llama_new_context_with_model(model, ctx_params);
+
+        // Tokenize the prompt
+        std::vector<llama_token> inp;
+
+        inp = ::llama_tokenize(ctx, params.prompt, false, false);
+
+        assert(inp.size() > params.num_tokens_header);
+
+        const int max_context_size = llama_n_ctx(ctx);
+        const int max_tokens_list_size = max_context_size - 4;
+
+        if ((int)inp.size() > max_tokens_list_size)
+        {
+            LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int)inp.size(), max_tokens_list_size);
+            return 1;
+        }
+
+        // Eval the start of the prompt
+        int i = 0;
+        for (auto id : inp)
+        {
+            LOG("%s", llama_token_to_piece(ctx, id).c_str());
+            if (++i >= params.num_tokens_header)
+                break;
+        }
+
+        fflush(stderr);
+
+        // encode stage
+
+        const auto t_enc_start = ggml_time_us();
+
+        struct gpt_sampler *smpl = gpt_sampler_init(model, params.sparams);
+
+        std::vector<uint8_t> sample_ids_bitpacked = encode(ctx, inp, smpl, params.num_tokens_header);
+
+        gpt_sampler_free(smpl);
+        llama_free(ctx);
+        llama_free_model(model);
+        auto t_enc_end = ggml_time_us();
+
+        LOG("\n");
+        if(!params.no_perf){
+            LOG("\nInput: %d characters (%d tokens)", params.prompt.length(), inp.size());
+
+            float compressed_byte_per_token = (float)sample_ids_bitpacked.size() / (float)inp.size();
+            float compressed_bits_per_char = 8 * (float)sample_ids_bitpacked.size() / (float)params.prompt.length();
+
+            LOG("\n%d compressed bytes,(%04f bytes per token, %04f bits per character)\n", (int)sample_ids_bitpacked.size(), compressed_byte_per_token, compressed_bits_per_char);
+        }
+        //maybe this needs to be changed
+        if(params.out_file != "imatrix.dat"){
+            // dump uint8array to bin file
+            std::ofstream ofs(params.out_file.c_str(), std::ios::binary);
+            ofs.write((char*)&sample_ids_bitpacked[0], sample_ids_bitpacked.size());
+            ofs.close();
+        }else{
+            LOG("\n------------\n");
+            //print as hex to stdout
+            for (int i = 0; i < sample_ids_bitpacked.size(); i++){
+                LOG("%02X ", sample_ids_bitpacked[i]);
+            }
+        }
+
+    }
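+    // expand mode: prompt_file must contain a byte stream produced by
+    // compress mode with the same model and sampling settings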
+    else if (params.compress_mode == 2)
+    {
+        //decompress mode
+        // load sample_ids_bitpacked from params.prompt_file
+        std::ifstream ifs(params.prompt_file.c_str(), std::ios::binary);
+
+        if (!ifs) {
+            LOG_ERR("%s: failed to open file\n", __func__);
+            return -1;
+        }
+        // Get the ifs size
+        ifs.seekg(0, std::ios::end);
+        std::streampos fileSize = ifs.tellg();
+        ifs.seekg(0, std::ios::beg);
+
+        // Reserve space in the vector
+        std::vector<uint8_t> sample_ids_bitpacked(fileSize);
+
+        // Read the ifs into the vector
+        if (!ifs.read(reinterpret_cast<char *>(sample_ids_bitpacked.data()), fileSize)) {
+            LOG_ERR("%s: failed to read file\n", __func__);
+            return -1;
+        }
+        ifs.close();
+
+        //Debug: print as hex
+        for (int i = 0; i < sample_ids_bitpacked.size(); i++){
+            LOG("%02X ", sample_ids_bitpacked[i]);
+        }
+        LOG("\n");
+
+        llama_model_params model_params = llama_model_params_from_gpt_params(params);
+        llama_model *model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+        llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+        llama_context *ctx = llama_new_context_with_model(model, ctx_params);
+
+        const auto t_dec_start = ggml_time_us();
+
+        struct gpt_sampler *smpl = gpt_sampler_init(model, params.sparams);
+
+        std::vector<llama_token> out = decode(ctx, smpl, sample_ids_bitpacked);
+
+
+        gpt_sampler_free(smpl);
+        auto t_dec_end = ggml_time_us();
+
+        //maybe this needs to be changed
+        if(params.out_file != "imatrix.dat"){
+            // dump as string to file
+            std::string out_str = ::llama_detokenize(ctx, out);
+
+            std::ofstream ofs(params.out_file.c_str(), std::ios::binary);
+            ofs.write((char*)&out_str[0], out_str.size());
+            ofs.close();
+        }
+
+        llama_free(ctx);
+        llama_free_model(model);
+
+    }
+
+    llama_backend_free();
+
+    LOG("\n\n");
+
+    return 0;
+}

From 1146007610af1e8fc8f2e9e4c708cad85d13ff34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Tue, 24 Sep 2024 23:52:00 +0200
Subject: [PATCH 2/8] compress: fix sampling problem introduced by
 b0f27361f3539a81d983a8b045f3c61e682d9fc0

---
 examples/compress/compress.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/compress/compress.cpp b/examples/compress/compress.cpp
index e27ab2b24..b800ab645 100644
--- a/examples/compress/compress.cpp
+++ b/examples/compress/compress.cpp
@@ -488,7 +488,8 @@ int main(int argc, char **argv)
     params.sparams.min_p = 0;
     params.sparams.top_p = 1;
     params.sparams.top_k = -1;
-    params.sparams.temp = 0;
+    // Avoid temp=0 because greedy sampling breaks stuff
+    params.sparams.temp = 1.;
 
     gpt_init();
 

From bd5b24e8b6705a8bfd0e45b508712b1b0dc622aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Tue, 24 Sep 2024 23:52:09 +0200
Subject: [PATCH 3/8] compress: cleanup

---
 examples/compress/compress.cpp | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/examples/compress/compress.cpp b/examples/compress/compress.cpp
index b800ab645..736a5bf6a 100644
--- a/examples/compress/compress.cpp
+++ b/examples/compress/compress.cpp
@@ -37,6 +37,8 @@ int msB_log256(int x)
 const int block_header_size = 2;
 const int fixed_token_cost = 1;
 
+int total_pad = 0;
+
 std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gpt_sampler *smpl, int num_raw_tokens_header)
 {
 
@@ -62,7 +64,6 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
     for (int index = num_raw_tokens_header; index < inp.size(); index++)
     {
         auto &cur_p = smpl->cur_p; // initialized by set_logits
-        // llama_sampler_apply(smpl->grmr, &cur_p);
         llama_sampler_apply(smpl->chain, &cur_p);
 
         int match = -1;
@@ -121,12 +122,10 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
         int sample_id = sample_ids[i];
         uint8_t PAD = (8 - bit_offset % 8) % 8;
         uint8_t bytesize = (uint8_t)msB_log256(sample_id);
-        // LOG("pos: %d, bs: %d\n",sample_id, bytesize);
 
         // Big number, better save as token
         if (sample_id > PAD + (block_header_size + fixed_token_cost + bytesize) * 8)
         {
-            // LOG("End block\n");
             // Close current block (0b1010 is block marker)
             if (was_block)
             {
@@ -151,21 +150,18 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
             }
         }
         bit_offset += PAD;
+        total_pad += PAD;
         if (bit_offset % 8)
         {
             LOG_ERR("Unreachable");
             exit(-1);
         }
-        // LOG("\n%d",bit_offset/8);
         // 0b0101 is token marker
-
         sample_ids_bitpacked.push_back(0b01010000 | bytesize);
         // put token bytes into sample_ids_bitpacked
-        // LOG("\n%d -> ",sample_id);
         for (int j = 0; j < bytesize; j++)
         {
             sample_ids_bitpacked.push_back(sample_id & 0xff);
-            LOG("%02x ", sample_id & 0xff);
             sample_id >>= 8;
         }
         if (sample_id)
@@ -217,6 +213,7 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
         int block_size = (bit_offset + PAD) / 8 - block_start;
         // endianness: big endian
         sample_ids_bitpacked[block_start + 1] = block_size & 0xff;
+        total_pad+=PAD;
     }
     llama_batch_free(batch);
     return sample_ids_bitpacked;
@@ -245,7 +242,6 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
         auto token_str = llama_token_to_piece(ctx, token);
         LOG("%s", token_str.c_str());
     }
-    LOG("\u001b[0m\u001b[37m");
     if (llama_decode(ctx, batch))
     {
         LOG_ERR("%s: llama_decode() failed\n", __func__);
@@ -275,6 +271,7 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
 
             auto &cur_p = smpl->cur_p; // initialized by set_logits
             llama_sampler_apply(smpl->chain, &cur_p);
+
             auto token_id = cur_p.data[sample_id].id;
 
             out.push_back(token_id);
@@ -288,12 +285,10 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
                 // print in red
                 LOG("\u001b[31m%s", llama_token_to_piece(ctx, token_id).c_str());
                 LOG("\nExpected: %s", llama_token_to_piece(ctx, inp[num_raw_tokens_header + index]).c_str());
-                // LOG("\n%d", num_raw_tokens_header + index);
                 LOG("\n, Id: %d != %d", token_id, inp[num_raw_tokens_header + index]);
                 LOG("\nPos: %d, bs:%d", sample_id, bytesize);
 
                 // print sample_id bytes in hex
-                // LOG("\n %02x %02x", sample_ids_bitpacked[bit_index / 8], sample_ids_bitpacked[bit_index / 8 + 1]);
                 LOG("\n");
                 for (int i = bytesize; i > 0; i--)
                 {
@@ -335,8 +330,8 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
                         int sample_id = id;
 
                         auto &cur_p = smpl->cur_p; // initialized by set_logits
-                        // llama_sampler_apply(smpl->grmr, &cur_p);
                         llama_sampler_apply(smpl->chain, &cur_p);
+
                         auto token_id = cur_p.data[sample_id].id;
                         out.push_back(token_id);
@@ -363,7 +358,6 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
 
                     id = 0;
                 }
             }
-            // LOG("\n(%d+%d)/8= %d\n",bit_index,PAD,(bit_index+PAD)/8);
             bit_index += PAD;
         }
     }
@@ -554,10 +548,12 @@ int main(int argc, char **argv)
         if(!params.no_perf){
             LOG("\nInput: %d characters (%d tokens)", params.prompt.length(), inp.size());
 
-            float compressed_byte_per_token = (float)sample_ids_bitpacked.size() / (float)inp.size();
+            float compressed_bits_per_token = 8 * (float)sample_ids_bitpacked.size() / (float)inp.size();
             float compressed_bits_per_char = 8 * (float)sample_ids_bitpacked.size() / (float)params.prompt.length();
 
-            LOG("\n%d compressed bytes,(%04f bytes per token, %04f bits per character)\n", (int)sample_ids_bitpacked.size(), compressed_byte_per_token, compressed_bits_per_char);
+            LOG("\n%d compressed bytes,(%04f bits per token, %04f bits per character)\n", (int)sample_ids_bitpacked.size(), compressed_bits_per_token, compressed_bits_per_char);
+            LOG("\n%d padding bits, (%04f bits per character without padding)", total_pad, compressed_bits_per_char - total_pad/(float)params.prompt.length());
+            LOG("\nPPL (over)estimation: %04f (%04f with padding)", exp2(compressed_bits_per_token-total_pad/(float)inp.size()),exp2(compressed_bits_per_token));
         }
         //maybe this needs to be changed
         if(params.out_file != "imatrix.dat"){
@@ -630,7 +626,7 @@ int main(int argc, char **argv)
             ofs.write((char*)&out_str[0], out_str.size());
             ofs.close();
         }
-        
+
         llama_free(ctx);
         llama_free_model(model);
 

From 77dd5d05a52f34ac6e267f13f7464ab6c9020f27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Wed, 25 Sep 2024 00:03:39 +0200
Subject: [PATCH 4/8] compress: update comment

---
 examples/compress/compress.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/compress/compress.cpp b/examples/compress/compress.cpp
index 736a5bf6a..7636019e2 100644
--- a/examples/compress/compress.cpp
+++ b/examples/compress/compress.cpp
@@ -133,14 +133,13 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
                 int block_size = (bit_offset + PAD) / 8 - block_start;
                 if (block_size >= 256)
                 {
-                    // TODO: figure it out
-                    LOG_ERR("OOPS");
+                    // TODO: handle more than 256 bytes of block data 
+                    // (maybe allow multiple blocks in a row)
+                    LOG_ERR("Block too big %d >= 256", block_size);
                     exit(-1);
                 }
                 sample_ids_bitpacked[block_start + 1] = block_size & 0xff;
 
-                // TODO: handle more than 256 bits of block data (multiple blocks or bigger header?)
-                // sample_ids_bitpacked[block_start + 2] = block_size >> 8;
 
                 // put last bytes
                 if (PAD)

From b9a32f464f8528476d6af7a837bb8fd8ce3a977f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Wed, 25 Sep 2024 01:20:53 +0200
Subject: [PATCH 5/8] compress: Fix missing c_str()

---
 examples/compress/compress.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/compress/compress.cpp b/examples/compress/compress.cpp
index 7636019e2..a0f79005f 100644
--- a/examples/compress/compress.cpp
+++ b/examples/compress/compress.cpp
@@ -73,13 +73,13 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
             llama_token candidate = tok.id;
             if (candidate == inp[index])
             {
-                LOG("%s", llama_token_to_piece(ctx, candidate));
+                LOG("%s", llama_token_to_piece(ctx, candidate).c_str());
                 match = i;
                 break;
             }
         }
         if(match<0){
-            LOG_ERR("\n couldn't match %s", llama_token_to_piece(ctx, inp[index]));
+            LOG_ERR("\n couldn't match %s", llama_token_to_piece(ctx, inp[index]).c_str());
             exit(1);
         }
         sample_ids.push_back(match);

From bec83989bed431266ac4d26535b722a6361fade0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Wed, 25 Sep 2024 01:26:39 +0200
Subject: [PATCH 6/8] compress: format

---
 examples/compress/compress.cpp | 60 +++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/examples/compress/compress.cpp b/examples/compress/compress.cpp
index a0f79005f..bd2756afa 100644
--- a/examples/compress/compress.cpp
+++ b/examples/compress/compress.cpp
@@ -78,7 +78,8 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
                 break;
             }
         }
-        if(match<0){
+        if (match < 0)
+        {
             LOG_ERR("\n couldn't match %s", llama_token_to_piece(ctx, inp[index]).c_str());
             exit(1);
         }
@@ -133,14 +134,13 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
                 int block_size = (bit_offset + PAD) / 8 - block_start;
                 if (block_size >= 256)
                 {
-                    // TODO: handle more than 256 bytes of block data 
+                    // TODO: handle more than 256 bytes of block data
                     // (maybe allow multiple blocks in a row)
                     LOG_ERR("Block too big %d >= 256", block_size);
                     exit(-1);
                 }
                 sample_ids_bitpacked[block_start + 1] = block_size & 0xff;
-
 
                 // put last bytes
                 if (PAD)
@@ -212,7 +212,7 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
         int block_size = (bit_offset + PAD) / 8 - block_start;
         // endianness: big endian
         sample_ids_bitpacked[block_start + 1] = block_size & 0xff;
-        total_pad+=PAD;
+        total_pad += PAD;
     }
     llama_batch_free(batch);
     return sample_ids_bitpacked;
@@ -330,7 +330,7 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
 
                         auto &cur_p = smpl->cur_p; // initialized by set_logits
                         llama_sampler_apply(smpl->chain, &cur_p);
-                        
+
                         auto token_id = cur_p.data[sample_id].id;
                         out.push_back(token_id);
                         if (!inp.size() || token_id == inp[num_raw_tokens_header + index])
@@ -482,7 +482,7 @@ int main(int argc, char **argv)
     params.sparams.top_p = 1;
     params.sparams.top_k = -1;
     // Avoid temp=0 because greedy sampling breaks stuff
-    params.sparams.temp = 1.; 
+    params.sparams.temp = 1.;
 
     gpt_init();
 
@@ -544,38 +544,43 @@ int main(int argc, char **argv)
         auto t_enc_end = ggml_time_us();
 
         LOG("\n");
-        if(!params.no_perf){
+        if (!params.no_perf)
+        {
             LOG("\nInput: %d characters (%d tokens)", params.prompt.length(), inp.size());
 
             float compressed_bits_per_token = 8 * (float)sample_ids_bitpacked.size() / (float)inp.size();
             float compressed_bits_per_char = 8 * (float)sample_ids_bitpacked.size() / (float)params.prompt.length();
 
             LOG("\n%d compressed bytes,(%04f bits per token, %04f bits per character)\n", (int)sample_ids_bitpacked.size(), compressed_bits_per_token, compressed_bits_per_char);
-            LOG("\n%d padding bits, (%04f bits per character without padding)", total_pad, compressed_bits_per_char - total_pad/(float)params.prompt.length());
-            LOG("\nPPL (over)estimation: %04f (%04f with padding)", exp2(compressed_bits_per_token-total_pad/(float)inp.size()),exp2(compressed_bits_per_token));
+            LOG("\n%d padding bits, (%04f bits per character without padding)", total_pad, compressed_bits_per_char - total_pad / (float)params.prompt.length());
+            LOG("\nPPL (over)estimation: %04f (%04f with padding)", exp2(compressed_bits_per_token - total_pad / (float)inp.size()), exp2(compressed_bits_per_token));
         }
-        //maybe this needs to be changed
-        if(params.out_file != "imatrix.dat"){
+        // maybe this needs to be changed
+        if (params.out_file != "imatrix.dat")
+        {
             // dump uint8array to bin file
             std::ofstream ofs(params.out_file.c_str(), std::ios::binary);
-            ofs.write((char*)&sample_ids_bitpacked[0], sample_ids_bitpacked.size());
+            ofs.write((char *)&sample_ids_bitpacked[0], sample_ids_bitpacked.size());
             ofs.close();
-        }else{
+        }
+        else
+        {
             LOG("\n------------\n");
-            //print as hex to stdout
-            for (int i = 0; i < sample_ids_bitpacked.size(); i++){
+            // print as hex to stdout
+            for (int i = 0; i < sample_ids_bitpacked.size(); i++)
+            {
                 LOG("%02X ", sample_ids_bitpacked[i]);
             }
         }
-
     }
     else if (params.compress_mode == 2)
     {
-        //decompress mode
-        // load sample_ids_bitpacked from params.prompt_file
+        // decompress mode
+        // load sample_ids_bitpacked from params.prompt_file
         std::ifstream ifs(params.prompt_file.c_str(), std::ios::binary);
 
-        if (!ifs) {
+        if (!ifs)
+        {
             LOG_ERR("%s: failed to open file\n", __func__);
             return -1;
         }
@@ -588,14 +593,16 @@ int main(int argc, char **argv)
         std::vector<uint8_t> sample_ids_bitpacked(fileSize);
 
         // Read the ifs into the vector
-        if (!ifs.read(reinterpret_cast<char *>(sample_ids_bitpacked.data()), fileSize)) {
+        if (!ifs.read(reinterpret_cast<char *>(sample_ids_bitpacked.data()), fileSize))
+        {
             LOG_ERR("%s: failed to read file\n", __func__);
             return -1;
         }
         ifs.close();
 
-        //Debug: print as hex
-        for (int i = 0; i < sample_ids_bitpacked.size(); i++){
+        // Debug: print as hex
+        for (int i = 0; i < sample_ids_bitpacked.size(); i++)
+        {
             LOG("%02X ", sample_ids_bitpacked[i]);
         }
         LOG("\n");
@@ -612,24 +619,23 @@ int main(int argc, char **argv)
         struct gpt_sampler *smpl = gpt_sampler_init(model, params.sparams);
 
         std::vector<llama_token> out = decode(ctx, smpl, sample_ids_bitpacked);
 
-
         gpt_sampler_free(smpl);
         auto t_dec_end = ggml_time_us();
 
-        //maybe this needs to be changed
-        if(params.out_file != "imatrix.dat"){
+        // maybe this needs to be changed
+        if (params.out_file != "imatrix.dat")
+        {
             // dump as string to file
             std::string out_str = ::llama_detokenize(ctx, out);
 
             std::ofstream ofs(params.out_file.c_str(), std::ios::binary);
-            ofs.write((char*)&out_str[0], out_str.size());
+            ofs.write((char *)&out_str[0], out_str.size());
             ofs.close();
         }
 
         llama_free(ctx);
         llama_free_model(model);
-
     }
 
     llama_backend_free();

From da444fafd76331240a4668114cd034622bc6c97c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Wed, 25 Sep 2024 11:56:47 +0200
Subject: [PATCH 7/8] compress: remove sampling.cpp dependency

---
 examples/compress/compress.cpp | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/examples/compress/compress.cpp b/examples/compress/compress.cpp
index bd2756afa..62981ec19 100644
--- a/examples/compress/compress.cpp
+++ b/examples/compress/compress.cpp
@@ -1,7 +1,6 @@
 #include "arg.h"
 #include "common.h"
 #include "sampling.h"
-#include "sampling.cpp"
 #include "log.h"
 #include "llama.h"
 
@@ -60,16 +59,15 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
 
     std::vector<int> sample_ids;
 
-    smpl->set_logits(ctx, num_raw_tokens_header - 1);
+    gpt_sampler_sample(smpl, ctx, num_raw_tokens_header - 1, true);
     for (int index = num_raw_tokens_header; index < inp.size(); index++)
     {
-        auto &cur_p = smpl->cur_p; // initialized by set_logits
-        llama_sampler_apply(smpl->chain, &cur_p);
+        auto cur_p = gpt_sampler_get_candidates(smpl); // initialized by set_logits
 
         int match = -1;
-        for (int i = 0; i < cur_p.size; i++)
+        for (int i = 0; i < cur_p->size; i++)
         {
-            auto tok = cur_p.data[i];
+            auto tok = cur_p->data[i];
             llama_token candidate = tok.id;
             if (candidate == inp[index])
             {
@@ -91,7 +89,7 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
             LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
             exit(1);
         }
-        smpl->set_logits(ctx, 0);
+        gpt_sampler_sample(smpl, ctx, 0, true);
     }
 
 
@@ -245,7 +242,7 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
         LOG_ERR("%s: llama_decode() failed\n", __func__);
         exit(1);
     }
 
-    smpl->set_logits(ctx, num_raw_tokens_header - 1);
+    gpt_sampler_sample(smpl, ctx, num_raw_tokens_header - 1, true);
 
     int index = 0;
     int bit_index = (1 + num_raw_tokens_header * 4) * 8;
@@ -268,10 +266,9 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
                 sample_id |= (int)sample_ids_bitpacked[i + (bit_index / 8)];
             }
 
-            auto &cur_p = smpl->cur_p; // initialized by set_logits
-            llama_sampler_apply(smpl->chain, &cur_p);
+            auto cur_p = gpt_sampler_get_candidates(smpl); // initialized by set_logits
 
-            auto token_id = cur_p.data[sample_id].id;
+            auto token_id = cur_p->data[sample_id].id;
 
             out.push_back(token_id);
@@ -303,7 +300,8 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
                 LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
                 exit(1);
             }
-            smpl->set_logits(ctx, 0);
+            gpt_sampler_sample(smpl, ctx, 0, true);
+
             index++;
 
             bit_index += 8 * (fixed_token_cost + bytesize);
@@ -328,10 +326,9 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
                     {
                         int sample_id = id;
 
-                        auto &cur_p = smpl->cur_p; // initialized by set_logits
-                        llama_sampler_apply(smpl->chain, &cur_p);
+                        auto cur_p = gpt_sampler_get_candidates(smpl); // initialized by set_logits
 
-                        auto token_id = cur_p.data[sample_id].id;
+                        auto token_id = cur_p->data[sample_id].id;
                         out.push_back(token_id);
                         if (!inp.size() || token_id == inp[num_raw_tokens_header + index])
                         {
@@ -350,7 +347,7 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
                             LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
                             exit(1);
                         }
-                        smpl->set_logits(ctx, 0);
+                        gpt_sampler_sample(smpl, ctx, 0, true);
                     }
                     index++;
 

From d3df98d6eaa293ff0a3ba7498f1f72cd393ae905 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Wed, 25 Sep 2024 12:07:41 +0200
Subject: [PATCH 8/8] compress: add cmath

---
 examples/compress/compress.cpp | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/examples/compress/compress.cpp b/examples/compress/compress.cpp
index 62981ec19..b56ca6808 100644
--- a/examples/compress/compress.cpp
+++ b/examples/compress/compress.cpp
@@ -5,23 +5,13 @@
 #include "llama.h"
 
 #include <cassert>
+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 #include <bitset>
 #include <fstream>
 
-int msb_log2(int x)
-{
-    int ret = 0;
-    while (x > 0)
-    {
-        ret++;
-        x >>= 1;
-    }
-    return ret;
-}
-
 int msB_log256(int x)
 {
     int ret = 0;
@@ -92,7 +82,6 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
         gpt_sampler_sample(smpl, ctx, 0, true);
     }
 
-
     // bit pack sample_ids
     std::vector<uint8_t> sample_ids_bitpacked;
     int bit_offset = 0;