From 76d2794e11f481aef9c920f32357b952bb815120 Mon Sep 17 00:00:00 2001
From: xaedes <xaedes@gmail.com>
Date: Mon, 28 Aug 2023 01:47:31 +0200
Subject: [PATCH] tokenize_file: fix output buffer sizing and token count

---
 examples/train-text-from-scratch/train-text-from-scratch.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 8f35fe2c9..52495a6b3 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1319,6 +1319,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
 
     std::vector<char> buf;
     buf.resize(size+1);
+    out.resize(size+1);
 
     if (std::fread(buf.data(), size, 1, fp) != 1) {
         throw std::runtime_error(std::string("unexpectedly reached end of file"));
@@ -1332,8 +1333,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
     int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
     if (n_tokens < 0) {
         out.resize(-n_tokens);
-        llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+        n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
     }
+    GGML_ASSERT(n_tokens >= 0);
+    out.resize(n_tokens);
 
     bool verify = false;
     if (verify) {