From 1d6059a5e2e422384265b364a03db0d6182aa3f1 Mon Sep 17 00:00:00 2001 From: JohannesGaessler Date: Wed, 7 Feb 2024 19:44:38 +0100 Subject: [PATCH] count token combinations --- examples/lookup-static/lookup-static.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/examples/lookup-static/lookup-static.cpp b/examples/lookup-static/lookup-static.cpp index 9e68fa86f..2424b4b5f 100644 --- a/examples/lookup-static/lookup-static.cpp +++ b/examples/lookup-static/lookup-static.cpp @@ -2,15 +2,17 @@ #include "llama.h" #include +#include #include #include #include #include #include +#include #include int main(int argc, char ** argv){ - const char * static_input_file = "./wikitext-2-raw/wiki.test.raw"; + const char * static_input_file = "./wikitext-2-raw/wiki.train.raw"; std::ifstream file(static_input_file); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", static_input_file); @@ -56,7 +58,23 @@ int main(int argc, char ** argv){ LOG("add_bos tgt: %d\n", add_bos); std::vector inp; - inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + std::vector inp_static; + inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + inp_static = ::llama_tokenize(ctx, static_input, add_bos, true); + + std::unordered_map hashmap = {}; + for (size_t i = 0; i < inp_static.size()-1; ++i) { + const int64_t key_low = inp_static[i + 0] << 0; + const int64_t key_high = inp_static[i + 1] << 32; + const int64_t key = key_low | key_high; + + if (hashmap.count(key) != 0) { + continue; + } + + hashmap.emplace(std::make_pair(key, -1)); + } + printf("\n\n%ld\n\n", hashmap.size()); const int max_context_size = llama_n_ctx(ctx); const int max_tokens_list_size = max_context_size - 4;