common : use common_ prefix for common library functions (#9805)
* common : use common_ prefix for common library functions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent 0e9f760eb1
commit 7eee341bee

45 changed files with 1284 additions and 1284 deletions
@@ -13,13 +13,13 @@
 #include <vector>
 
 int main(int argc, char ** argv){
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     // max. number of additional tokens to draft if match is found
     const int n_draft = params.n_draft;
@@ -31,29 +31,29 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);
 
     // load the model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);
 
-    llama_ngram_cache ngram_cache_context;
-    llama_ngram_cache ngram_cache_dynamic;
-    llama_ngram_cache ngram_cache_static;
+    common_ngram_cache ngram_cache_context;
+    common_ngram_cache ngram_cache_dynamic;
+    common_ngram_cache ngram_cache_static;
     int64_t t_draft_flat_us = 0;
     int64_t t_draft_us = 0;
 
     {
         // Fill up context ngram cache with tokens from user input:
         const int64_t t_start_draft_us = ggml_time_us();
-        llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
+        common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
 
         if (!params.lookup_cache_static.empty()) {
            try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+                ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
            } catch (std::ifstream::failure const &) {
                 LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
@@ -62,7 +62,7 @@ int main(int argc, char ** argv){
 
     if (!params.lookup_cache_dynamic.empty()) {
         try {
-            ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+            ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
         } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
     }
 
@@ -80,7 +80,7 @@ int main(int argc, char ** argv){
     LOG("\n\n");
 
     for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -102,7 +102,7 @@ int main(int argc, char ** argv){
 
     bool has_eos = false;
 
-    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
 
     std::vector<llama_token> draft;
 
@@ -117,7 +117,7 @@ int main(int argc, char ** argv){
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-            llama_kv_cache_dump_view_seqs(kvc_view, 40);
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // print current draft sequence
@@ -126,11 +126,11 @@ int main(int argc, char ** argv){
         int i_dft = 0;
         while (true) {
             // sample from the target model
-            llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
+            llama_token id = common_sampler_sample(smpl, ctx, i_dft);
 
-            gpt_sampler_accept(smpl, id, true);
+            common_sampler_accept(smpl, id, true);
 
-            const std::string token_str = llama_token_to_piece(ctx, id);
+            const std::string token_str = common_token_to_piece(ctx, id);
 
             if (!params.use_color) {
                 LOG("%s", token_str.c_str());
@@ -152,7 +152,7 @@ int main(int argc, char ** argv){
                 {
                     // Update context ngram cache with the newly accepted token:
                     const int64_t t_start_draft_us = ggml_time_us();
-                    llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+                    common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
                     t_draft_us += ggml_time_us() - t_start_draft_us;
                 }
 
@@ -178,7 +178,7 @@ int main(int argc, char ** argv){
             {
                 // Update context ngram cache with the newly accepted token:
                 const int64_t t_start_draft_us = ggml_time_us();
-                llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+                common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
                 t_draft_us += ggml_time_us() - t_start_draft_us;
             }
             break;
@@ -192,18 +192,18 @@ int main(int argc, char ** argv){
         // clean the cache of draft tokens that weren't accepted
         llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
 
-        llama_batch_clear(batch_tgt);
-        llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
+        common_batch_clear(batch_tgt);
+        common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
 
         // Draft already contains a single token sampled from the model:
         GGML_ASSERT(draft.size() == 1);
         GGML_ASSERT(draft[0] == inp.back());
         const int64_t t_start_draft_us = ggml_time_us();
 
-        llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+        common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
 
         for (size_t i = 1; i < draft.size(); ++i) {
-            llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+            common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
         }
 
         t_draft_us += ggml_time_us() - t_start_draft_us;
@@ -218,8 +218,8 @@ int main(int argc, char ** argv){
     auto t_dec_end = ggml_time_us();
 
     // Update dynamic ngram cache with context ngram cache and save it to disk:
-    llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
-    llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
+    common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+    common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
 
     LOG("\n\n");
 
@@ -237,9 +237,9 @@ int main(int argc, char ** argv){
     LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
     LOG_INF("\ntarget:\n\n");
-    gpt_perf_print(ctx, smpl);
+    common_perf_print(ctx, smpl);
 
-    gpt_sampler_free(smpl);
+    common_sampler_free(smpl);
 
     llama_batch_free(batch_tgt);
 