From 7930caf24c509698b68c47f342de552070dbf0cb Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sat, 16 Sep 2023 20:36:43 +0200
Subject: [PATCH] fix usage of llama_tokenize

---
 common/train.cpp | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index 9357fab0f..ef147b140 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -853,10 +853,22 @@ size_t tokenize_file(
         // tokenize all data at once
         out_tokens.resize(buf.size() + n_max_tokens_overhead);
 
-        int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), (int) out_tokens.size(), false);
+        int n_tokens = llama_tokenize(
+            lctx,
+            buf.data(),
+            (int) buf.size(),
+            out_tokens.data(),
+            (int) out_tokens.size(),
+            false);
         if (n_tokens < 0) {
             out_tokens.resize(-n_tokens);
-            n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), (int) out_tokens.size(), false);
+            n_tokens = llama_tokenize(
+                lctx,
+                buf.data(),
+                (int) buf.size(),
+                out_tokens.data(),
+                (int) out_tokens.size(),
+                false);
         }
         if (n_tokens >= 0) {
             out_tokens.resize(n_tokens);
@@ -948,14 +960,18 @@ size_t tokenize_file(
             tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
             int n_tokens = llama_tokenize(lctx,
                 buf_sample.data(),
+                (int) buf_sample.size(),
                 tok_sample.data(),
-                (int) tok_sample.size(), false);
+                (int) tok_sample.size(),
+                false);
             if (n_tokens < 0) {
                 tok_sample.resize(-n_tokens);
                 n_tokens = llama_tokenize(lctx,
                     buf_sample.data(),
+                    (int) buf_sample.size(),
                     tok_sample.data(),
-                    (int) tok_sample.size(), false);
+                    (int) tok_sample.size(),
+                    false);
                 GGML_ASSERT(n_tokens >= 0);
             }
             GGML_ASSERT(n_tokens <= (int) tok_sample.size());