From 76d2794e11f481aef9c920f32357b952bb815120 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 01:47:31 +0200 Subject: [PATCH] bug fixes in tokenize_file --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 8f35fe2c9..52495a6b3 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1319,6 +1319,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto std::vector<char> buf; buf.resize(size+1); + out.resize(size+1); if (std::fread(buf.data(), size, 1, fp) != 1) { throw std::runtime_error(std::string("unexpectedly reached end of file")); @@ -1332,8 +1333,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); if (n_tokens < 0) { out.resize(-n_tokens); - llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); + n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); } + GGML_ASSERT(n_tokens >= 0); + out.resize(n_tokens); bool verify = false; if (verify) {