whisper : tokenizer fix + re-enable tokenizer test for LLaMa (#3096)
* Fix for #2721
* Re-enable tokenizer test for LLaMa
* Add `console.cpp` dependency
* Fix dependency to `common`
* Fix the previous, incorrect fix
* Make console usage platform specific; work on compiler warnings
* Adapt the makefile
* Remove trailing whitespace
* Adapt the other parts of the makefile
* Fix typo
parent 1b6c650d16
commit 71ca2fad7d
6 changed files with 142 additions and 118 deletions
@@ -3121,10 +3121,9 @@ struct llm_tokenizer_spm {
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
-            GGML_ASSERT(offs + len <= text.size());
             sym.text = text.c_str() + offs;
-            sym.n = len;
-            offs += len;
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
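The clamp matters when the input ends in the middle of a multi-byte UTF-8 character: the length reported by the lead byte can exceed the bytes actually remaining, so the old `sym.n = len; offs += len;` stepped past `text.size()` and the removed assertion turned that into an abort. Below is a minimal standalone sketch of the behaviour, not part of this commit, using the same kind of lead-byte lookup the tokenizer's `utf8_len()` performs:

```cpp
// Standalone sketch (not from the commit) of why the clamp is needed when
// the input ends mid-character.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>

// Length of a UTF-8 sequence as announced by its lead byte.
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

int main() {
    // "ab" followed by the lead byte of a 2-byte character (0xC3) whose
    // continuation byte is missing, i.e. text truncated mid-character.
    std::string text = "ab\xC3";

    size_t offs = 0;
    while (offs < text.size()) {
        size_t len = utf8_len(text[offs]);
        // Old behaviour: take len bytes unconditionally -> offs overruns.
        // New behaviour: never take more bytes than actually remain.
        size_t n = std::min(len, text.size() - offs);
        std::printf("symbol at %zu: announced %zu byte(s), taking %zu\n",
                    offs, len, n);
        offs += n;
    }
    return 0;
}
```

With the clamp, the trailing truncated byte is emitted as a one-byte symbol and the loop terminates cleanly instead of asserting.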
@@ -6218,7 +6217,7 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize_internal(model->vocab, text, add_bos);

     if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
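Commenting out the log reflects that a negative return value is the normal way for this function to report the required token count when the caller's buffer is too small, not an error worth printing. Below is a hypothetical two-pass caller, sketched under the assumption of the `llama_tokenize_with_model()` signature shown at this revision (`model, text, tokens, n_max_tokens, add_bos`); the helper name `tokenize_all` is made up for illustration:

```cpp
// Hypothetical caller (not from this commit) using the negative-return
// convention: a first call with a zero-sized buffer yields the required count.
#include <vector>

#include "llama.h"

static std::vector<llama_token> tokenize_all(const llama_model * model,
                                             const char * text,
                                             bool add_bos) {
    // First pass: zero-sized buffer; the negated return value is the number
    // of tokens the text produces.
    int n = llama_tokenize_with_model(model, text, nullptr, 0, add_bos);
    if (n < 0) {
        n = -n;
    }

    // Second pass: the buffer is now large enough, so the "too many tokens"
    // branch above is not taken.
    std::vector<llama_token> tokens(n);
    n = llama_tokenize_with_model(model, text, tokens.data(), n, add_bos);
    tokens.resize(n > 0 ? n : 0);
    return tokens;
}
```

The first call deliberately passes an empty buffer so the negated size can be used to allocate exactly enough room for the second call.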