update ngram-cache
This commit is contained in:
parent
aee57d44c6
commit
cd097710d7
6 changed files with 82 additions and 82 deletions
|
@ -8,7 +8,7 @@
|
|||
#include <fstream>
|
||||
#include <thread>
|
||||
|
||||
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
||||
void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
||||
std::vector<llama_token> & inp, int nnew, bool print_progress) {
|
||||
const int64_t t_start_ms = ggml_time_ms();
|
||||
const int64_t inp_size = inp.size();
|
||||
|
@ -20,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
|
|||
const int64_t i_start = std::max(inp_size - nnew, ngram_size);
|
||||
for (int64_t i = i_start; i < inp_size; ++i) {
|
||||
const int64_t ngram_start = i - ngram_size;
|
||||
llama_ngram ngram(&inp[ngram_start], ngram_size);
|
||||
common_ngram ngram(&inp[ngram_start], ngram_size);
|
||||
const llama_token token = inp[i];
|
||||
|
||||
llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
|
||||
common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
|
||||
if (part_it == ngram_cache.end()) {
|
||||
llama_ngram_cache_part part;
|
||||
common_ngram_cache_part part;
|
||||
part.emplace(token, 1);
|
||||
ngram_cache.emplace(ngram, part);
|
||||
} else {
|
||||
llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
|
||||
common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
|
||||
if (token_count_it == part_it->second.end()) {
|
||||
part_it->second.emplace(token, 1);
|
||||
} else {
|
||||
|
@ -62,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
|
|||
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
|
||||
|
||||
// Helper function that tries to draft a token from only the static ngram cache:
|
||||
static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
|
||||
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
|
||||
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
if (part_static_it == nc_static.end()) {
|
||||
return -1;
|
||||
}
|
||||
const llama_ngram_cache_part part_static = part_static_it->second;
|
||||
const common_ngram_cache_part part_static = part_static_it->second;
|
||||
|
||||
int max_count_static = 0;
|
||||
int sum_count_static = 0;
|
||||
|
@ -95,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng
|
|||
|
||||
// Try to draft a token from primary cache (context/dynamic), validate with static cache:
|
||||
static llama_token try_draft(
|
||||
llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
|
||||
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
|
||||
const int * min_sample_size, const int * min_percent) {
|
||||
|
||||
llama_token drafted_token = -1;
|
||||
|
||||
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
|
||||
const llama_ngram ngram_primary = ngrams_primary[i];
|
||||
const common_ngram ngram_primary = ngrams_primary[i];
|
||||
|
||||
llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
|
||||
common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
|
||||
if (part_primary_it == nc_primary.end()) {
|
||||
continue;
|
||||
}
|
||||
const llama_ngram_cache_part part_primary = part_primary_it->second;
|
||||
const common_ngram_cache_part part_primary = part_primary_it->second;
|
||||
|
||||
int max_count_primary = 0;
|
||||
int max_count_static = 0;
|
||||
|
@ -117,7 +117,7 @@ static llama_token try_draft(
|
|||
for (std::pair<llama_token, int> token_count_primary : part_primary) {
|
||||
const llama_token token = token_count_primary.first;
|
||||
|
||||
llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
|
||||
common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
|
||||
|
||||
const int32_t count_primary = token_count_primary.second;
|
||||
const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
|
||||
|
@ -142,9 +142,9 @@ static llama_token try_draft(
|
|||
return drafted_token;
|
||||
}
|
||||
|
||||
void llama_ngram_cache_draft(
|
||||
void common_ngram_cache_draft(
|
||||
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
|
||||
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
|
||||
) {
|
||||
GGML_ASSERT(draft.size() == 1);
|
||||
const int inp_size = inp.size();
|
||||
|
@ -157,21 +157,21 @@ void llama_ngram_cache_draft(
|
|||
llama_token drafted_token = -1;
|
||||
|
||||
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
|
||||
llama_ngram ngram_static;
|
||||
common_ngram ngram_static;
|
||||
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
|
||||
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
|
||||
}
|
||||
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
llama_ngram_cache_part part_static;
|
||||
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
common_ngram_cache_part part_static;
|
||||
if (part_static_it != nc_static.end()) {
|
||||
part_static = part_static_it->second;
|
||||
}
|
||||
|
||||
// cd = context + dynamic
|
||||
std::vector<llama_ngram> ngrams_cd;
|
||||
std::vector<common_ngram> ngrams_cd;
|
||||
for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
|
||||
const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
|
||||
llama_ngram ngram_cd;
|
||||
common_ngram ngram_cd;
|
||||
for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
|
||||
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
|
||||
}
|
||||
|
@ -196,16 +196,16 @@ void llama_ngram_cache_draft(
|
|||
}
|
||||
}
|
||||
|
||||
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
|
||||
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
|
||||
std::ofstream file_out(filename, std::ios::binary);
|
||||
for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
|
||||
const llama_ngram ngram = item.first;
|
||||
llama_ngram_cache_part token_counts = item.second;
|
||||
for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
|
||||
const common_ngram ngram = item.first;
|
||||
common_ngram_cache_part token_counts = item.second;
|
||||
GGML_ASSERT(!token_counts.empty());
|
||||
const int32_t ntokens = token_counts.size();
|
||||
GGML_ASSERT(ntokens > 0);
|
||||
|
||||
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
|
||||
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
|
||||
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
|
||||
for (std::pair<llama_token, int32_t> item2 : token_counts) {
|
||||
const llama_token token = item2.first;
|
||||
|
@ -219,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
|
|||
|
||||
}
|
||||
|
||||
llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
|
||||
common_ngram_cache common_ngram_cache_load(std::string & filename) {
|
||||
std::ifstream hashmap_file(filename, std::ios::binary);
|
||||
if (!hashmap_file) {
|
||||
throw std::ifstream::failure("Unable to open file " + filename);
|
||||
}
|
||||
llama_ngram_cache ngram_cache;
|
||||
common_ngram_cache ngram_cache;
|
||||
|
||||
llama_ngram ngram;
|
||||
common_ngram ngram;
|
||||
int32_t ntokens;
|
||||
llama_token token;
|
||||
int32_t count;
|
||||
|
@ -235,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
|
|||
char * ntokensc = reinterpret_cast<char*>(&ntokens);
|
||||
char * tokenc = reinterpret_cast<char*>(&token);
|
||||
char * countc = reinterpret_cast<char*>(&count);
|
||||
while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
|
||||
while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
|
||||
GGML_ASSERT(ntokens > 0);
|
||||
llama_ngram_cache_part token_counts;
|
||||
common_ngram_cache_part token_counts;
|
||||
|
||||
for (int i = 0; i < ntokens; ++i) {
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
|
@ -257,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
|
|||
return ngram_cache;
|
||||
}
|
||||
|
||||
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
|
||||
for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
|
||||
const llama_ngram ngram = ngram_part.first;
|
||||
llama_ngram_cache_part part = ngram_part.second;
|
||||
void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
|
||||
for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
|
||||
const common_ngram ngram = ngram_part.first;
|
||||
common_ngram_cache_part part = ngram_part.second;
|
||||
|
||||
llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
|
||||
common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
|
||||
if (part_merged_it == ngram_cache_target.end()) {
|
||||
ngram_cache_target.emplace(ngram, part);
|
||||
continue;
|
||||
|
@ -273,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
|
|||
const int32_t count = token_count.second;
|
||||
GGML_ASSERT(count > 0);
|
||||
|
||||
llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
|
||||
common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
|
||||
if (token_count_merged_it == part_merged_it->second.end()) {
|
||||
part_merged_it->second.emplace(token, count);
|
||||
continue;
|
||||
|
|
|
@ -12,22 +12,22 @@
|
|||
|
||||
// Data structures to map n-grams to empirical token probabilities:
|
||||
|
||||
struct llama_ngram {
|
||||
struct common_ngram {
|
||||
llama_token tokens[LLAMA_NGRAM_MAX];
|
||||
|
||||
llama_ngram() {
|
||||
common_ngram() {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
tokens[i] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
llama_ngram(const llama_token * input, const int ngram_size) {
|
||||
common_ngram(const llama_token * input, const int ngram_size) {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
tokens[i] = i < ngram_size ? input[i] : -1;
|
||||
}
|
||||
}
|
||||
|
||||
bool operator==(const llama_ngram & other) const {
|
||||
bool operator==(const common_ngram & other) const {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
if (tokens[i] != other.tokens[i]) {
|
||||
return false;
|
||||
|
@ -37,28 +37,28 @@ struct llama_ngram {
|
|||
}
|
||||
};
|
||||
|
||||
struct llama_token_hash_function {
|
||||
struct common_token_hash_function {
|
||||
size_t operator()(const llama_token token) const {
|
||||
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
|
||||
return token * 11400714819323198485llu;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_ngram_hash_function {
|
||||
size_t operator()(const llama_ngram & ngram) const {
|
||||
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
|
||||
struct common_ngram_hash_function {
|
||||
size_t operator()(const common_ngram & ngram) const {
|
||||
size_t hash = common_token_hash_function{}(ngram.tokens[0]);
|
||||
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
|
||||
hash ^= common_token_hash_function{}(ngram.tokens[i]);
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
};
|
||||
|
||||
// token -> number of times token has been seen
|
||||
typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
|
||||
typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
|
||||
|
||||
// n-gram -> empirical distribution of following tokens
|
||||
typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
|
||||
typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
|
||||
|
||||
|
||||
// Update an ngram cache with tokens.
|
||||
|
@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
|
|||
//
|
||||
// In order to get correct results inp_data can ONLY BE APPENDED TO.
|
||||
// Changes in the middle need a complete rebuild.
|
||||
void llama_ngram_cache_update(
|
||||
llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
|
||||
void common_ngram_cache_update(
|
||||
common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
|
||||
|
||||
// Try to draft tokens from ngram caches.
|
||||
// inp: the tokens generated so far.
|
||||
|
@ -81,21 +81,21 @@ void llama_ngram_cache_update(
|
|||
// nc_context: ngram cache based on current context.
|
||||
// nc_dynamic: ngram cache based on previous user generations.
|
||||
// nc_static: ngram cache generated from a large text corpus, used for validation.
|
||||
void llama_ngram_cache_draft(
|
||||
void common_ngram_cache_draft(
|
||||
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
|
||||
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
|
||||
|
||||
// Save an ngram cache to a file.
|
||||
// ngram_cache: the ngram cache to save.
|
||||
// filename: the path under which to save the ngram cache.
|
||||
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
|
||||
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
|
||||
|
||||
// Load an ngram cache saved with llama_ngram_cache_save.
|
||||
// Load an ngram cache saved with common_ngram_cache_save.
|
||||
// filename: the path from which to load the ngram cache.
|
||||
// returns: an ngram cache containing the information saved to filename.
|
||||
llama_ngram_cache llama_ngram_cache_load(std::string & filename);
|
||||
common_ngram_cache common_ngram_cache_load(std::string & filename);
|
||||
|
||||
// Merge two ngram caches.
|
||||
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
|
||||
// ngram_cache_add: the ngram cache to add to ngram_cache_target.
|
||||
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
|
||||
void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
|
||||
|
|
|
@ -35,11 +35,11 @@ int main(int argc, char ** argv){
|
|||
fprintf(stderr, "%s: tokenization done\n", __func__);
|
||||
|
||||
|
||||
llama_ngram_cache ngram_cache;
|
||||
llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
|
||||
common_ngram_cache ngram_cache;
|
||||
common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
|
||||
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
|
||||
|
||||
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
|
||||
common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -33,15 +33,15 @@ int main(int argc, char ** argv){
|
|||
}
|
||||
|
||||
fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
|
||||
llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
|
||||
common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);
|
||||
|
||||
for (size_t i = 1; i < args.size()-1; ++i) {
|
||||
fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
|
||||
llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
|
||||
common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);
|
||||
|
||||
llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
|
||||
common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
|
||||
}
|
||||
|
||||
fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
|
||||
llama_ngram_cache_save(ngram_cache_merged, args.back());
|
||||
common_ngram_cache_save(ngram_cache_merged, args.back());
|
||||
}
|
||||
|
|
|
@ -37,9 +37,9 @@ int main(int argc, char ** argv){
|
|||
std::vector<llama_token> inp;
|
||||
inp = common_tokenize(ctx, params.prompt, true, true);
|
||||
|
||||
llama_ngram_cache ngram_cache_context;
|
||||
llama_ngram_cache ngram_cache_dynamic;
|
||||
llama_ngram_cache ngram_cache_static;
|
||||
common_ngram_cache ngram_cache_context;
|
||||
common_ngram_cache ngram_cache_dynamic;
|
||||
common_ngram_cache ngram_cache_static;
|
||||
int64_t t_draft_flat_us = 0;
|
||||
int64_t t_draft_us = 0;
|
||||
|
||||
|
@ -48,7 +48,7 @@ int main(int argc, char ** argv){
|
|||
|
||||
if (!params.lookup_cache_static.empty()) {
|
||||
try {
|
||||
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
|
||||
ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
|
||||
} catch (std::ifstream::failure const &) {
|
||||
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
||||
exit(1);
|
||||
|
@ -57,7 +57,7 @@ int main(int argc, char ** argv){
|
|||
|
||||
if (!params.lookup_cache_dynamic.empty()) {
|
||||
try {
|
||||
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
|
||||
ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
|
||||
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
|
||||
}
|
||||
|
||||
|
@ -86,7 +86,7 @@ int main(int argc, char ** argv){
|
|||
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
|
||||
common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
|
||||
|
@ -105,7 +105,7 @@ int main(int argc, char ** argv){
|
|||
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
|
||||
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
}
|
||||
|
@ -115,7 +115,7 @@ int main(int argc, char ** argv){
|
|||
pseudo_output.push_back(inp_slice[pseudo_output.size()]);
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
|
||||
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
}
|
||||
|
@ -133,7 +133,7 @@ int main(int argc, char ** argv){
|
|||
}
|
||||
|
||||
// After each chunk, update the dynamic ngram cache with the context ngram cache:
|
||||
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
|
||||
common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
|
||||
ngram_cache_context.clear();
|
||||
}
|
||||
|
||||
|
|
|
@ -40,20 +40,20 @@ int main(int argc, char ** argv){
|
|||
std::vector<llama_token> inp;
|
||||
inp = common_tokenize(ctx, params.prompt, true, true);
|
||||
|
||||
llama_ngram_cache ngram_cache_context;
|
||||
llama_ngram_cache ngram_cache_dynamic;
|
||||
llama_ngram_cache ngram_cache_static;
|
||||
common_ngram_cache ngram_cache_context;
|
||||
common_ngram_cache ngram_cache_dynamic;
|
||||
common_ngram_cache ngram_cache_static;
|
||||
int64_t t_draft_flat_us = 0;
|
||||
int64_t t_draft_us = 0;
|
||||
|
||||
{
|
||||
// Fill up context ngram cache with tokens from user input:
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
|
||||
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
|
||||
|
||||
if (!params.lookup_cache_static.empty()) {
|
||||
try {
|
||||
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
|
||||
ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
|
||||
} catch (std::ifstream::failure const &) {
|
||||
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
||||
exit(1);
|
||||
|
@ -62,7 +62,7 @@ int main(int argc, char ** argv){
|
|||
|
||||
if (!params.lookup_cache_dynamic.empty()) {
|
||||
try {
|
||||
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
|
||||
ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
|
||||
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
|
||||
}
|
||||
|
||||
|
@ -152,7 +152,7 @@ int main(int argc, char ** argv){
|
|||
{
|
||||
// Update context ngram cache with the newly accepted token:
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
|
||||
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
|
||||
|
@ -178,7 +178,7 @@ int main(int argc, char ** argv){
|
|||
{
|
||||
// Update context ngram cache with the newly accepted token:
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
|
||||
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
break;
|
||||
|
@ -200,7 +200,7 @@ int main(int argc, char ** argv){
|
|||
GGML_ASSERT(draft[0] == inp.back());
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
|
||||
llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
|
||||
common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
|
||||
|
||||
for (size_t i = 1; i < draft.size(); ++i) {
|
||||
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
|
||||
|
@ -218,8 +218,8 @@ int main(int argc, char ** argv){
|
|||
auto t_dec_end = ggml_time_us();
|
||||
|
||||
// Update dynamic ngram cache with context ngram cache and save it to disk:
|
||||
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
|
||||
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
|
||||
common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
|
||||
common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
|
||||
|
||||
LOG("\n\n");
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue