perplexity : add log for start of tokenization
This commit is contained in:
parent
630d8b408a
commit
fae8faa135
2 changed files with 5 additions and 0 deletions
|
@ -41,6 +41,8 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
const bool add_bos = is_spm;
|
const bool add_bos = is_spm;
|
||||||
|
|
||||||
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
|
|
||||||
const int calc_chunk = params.n_ctx;
|
const int calc_chunk = params.n_ctx;
|
||||||
|
@ -152,6 +154,8 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
const bool add_bos = is_spm;
|
const bool add_bos = is_spm;
|
||||||
|
|
||||||
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
|
|
||||||
const int n_chunk_max = tokens.size() / params.n_ctx;
|
const int n_chunk_max = tokens.size() / params.n_ctx;
|
||||||
|
|
|
@ -3321,6 +3321,7 @@ private:
|
||||||
}
|
}
|
||||||
|
|
||||||
// probably not 100% correct
|
// probably not 100% correct
|
||||||
|
// TODO: this is quite slow - how to make it more efficient?
|
||||||
static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
|
static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
|
||||||
std::vector<std::string> words;
|
std::vector<std::string> words;
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue