llama_tokenizer() in fact requires valid utf8
parent 8e027f8dcd
commit ca88ce7b77
1 changed file with 9 additions and 3 deletions
|
@@ -21,6 +21,7 @@ static LlgConstraint *llama_sampler_llg_new(LlgTokenizer *tokenizer,
                                             const char * grammar_kind, const char * grammar_data) {
     LlgConstraintInit cinit;
     llg_constraint_init_set_defaults(&cinit, tokenizer);
+    // cinit.log_stderr_level = 2;
     auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
     if (llg_get_error(c)) {
         LOG_ERR("llg error: %s\n", llg_get_error(c));
@@ -135,8 +136,13 @@ static size_t llama_sampler_llg_tokenize_fn(const void *user_data,
                                             size_t output_tokens_len)
 {
     const llama_vocab *vocab = (const llama_vocab *)user_data;
-    int r = llama_tokenize(vocab, (const char *) bytes, bytes_len,
-                           (int32_t*)output_tokens, output_tokens_len, false, true);
+    int r = 0;
+    try {
+        r = llama_tokenize(vocab, (const char *) bytes, bytes_len,
+                           (int32_t*)output_tokens, output_tokens_len, false, true);
+    } catch (const std::exception &e) {
+        GGML_ABORT("llama_tokenize failed: %s\n", e.what());
+    }
     if (r < 0)
         return -r;
     return r;
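Why the try/catch: per the commit title, llama_tokenize() requires valid UTF-8 and signals a violation by throwing a C++ exception, while llguidance invokes this function through a plain C function pointer, so no exception may unwind across that boundary; the patch therefore traps it and aborts. Below is a minimal, self-contained sketch of the same pattern; fake_tokenize and tokenize_cb are illustrative stand-ins, not llama.cpp or llguidance API.

#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <string>

// Stand-in for llama_tokenize(): throws on input it cannot decode.
// (For brevity it only accepts ASCII; the real requirement is UTF-8 validity.)
static int fake_tokenize(const std::string & text) {
    for (unsigned char c : text) {
        if (c >= 0x80) throw std::invalid_argument("invalid UTF-8");
    }
    return (int) text.size(); // pretend every ASCII byte is one token
}

// C-style callback handed across an FFI boundary: no exception may escape,
// so it is caught and turned into a hard abort (the patch uses GGML_ABORT).
extern "C" size_t tokenize_cb(const char * bytes, size_t bytes_len) {
    int r = 0;
    try {
        r = fake_tokenize(std::string(bytes, bytes_len));
    } catch (const std::exception & e) {
        fprintf(stderr, "tokenize failed: %s\n", e.what());
        abort();
    }
    return r < 0 ? (size_t) -r : (size_t) r;
}

int main() {
    printf("%zu\n", tokenize_cb("hello", 5)); // prints 5
    // tokenize_cb("\xff", 1);                // would abort: not valid UTF-8
}

Aborting is the pragmatic choice here because the callback's size_t return value is already used to report the required buffer size (the r < 0 case), leaving no channel for an error code.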
@@ -197,7 +203,7 @@ static LlgTokenizer *llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
         /* .token_lens = */ token_lens,
         /* .token_bytes = */ token_bytes,
         /* .tokenizer_json = */ nullptr,
-        /* .tokenize_assumes_string = */ false,
+        /* .tokenize_assumes_string = */ true,
         /* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
         /* .use_approximate_greedy_tokenize_fn = */ false,
         /* .tokenize_user_data = */ vocab,
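The companion change: judging by the field name and the commit title, flipping tokenize_assumes_string to true tells llguidance that this tokenize callback only accepts valid UTF-8 strings rather than arbitrary byte slices, so it should not be handed raw bytes that would trip the abort above. As a rough illustration of what the callback may now assume about its input, here is a hypothetical standalone checker (not part of llama.cpp or llguidance); it verifies only the structural lead/continuation byte pattern and ignores overlong encodings and surrogates.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Minimal structural UTF-8 check: every lead byte must be followed by the
// right number of 0b10xxxxxx continuation bytes. Overlong encodings and
// surrogates are not rejected, so this is weaker than a full validator.
static bool is_valid_utf8(const uint8_t * s, size_t n) {
    for (size_t i = 0; i < n;) {
        int extra = s[i] < 0x80           ? 0
                  : (s[i] & 0xE0) == 0xC0 ? 1
                  : (s[i] & 0xF0) == 0xE0 ? 2
                  : (s[i] & 0xF8) == 0xF0 ? 3
                                          : -1; // 0xF8..0xFF never start a sequence
        if (extra < 0 || i + extra >= n) return false; // bad lead or truncated tail
        for (int k = 1; k <= extra; k++) {
            if ((s[i + k] & 0xC0) != 0x80) return false; // not a continuation byte
        }
        i += extra + 1;
    }
    return true;
}

int main() {
    const uint8_t ok[]  = { 'h', 'i', 0xC3, 0xA9 }; // "hi" followed by U+00E9
    const uint8_t bad[] = { 0xFF, 'x' };            // 0xFF cannot appear in UTF-8
    printf("%d %d\n", is_valid_utf8(ok, sizeof ok), is_valid_utf8(bad, sizeof bad)); // 1 0
}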