From 7930caf24c509698b68c47f342de552070dbf0cb Mon Sep 17 00:00:00 2001
From: xaedes <xaedes@gmail.com>
Date: Sat, 16 Sep 2023 20:36:43 +0200
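Subject: [PATCH] fix usage of llama_tokenize

Pass the size of the input text buffer as an explicit argument, right
after the text pointer, in all four llama_tokenize calls touched in
tokenize_file (both the whole-file path and the per-sample path,
including their retry-after-resize calls).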
---
 common/train.cpp | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index 9357fab0f..ef147b140 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -853,10 +853,22 @@ size_t tokenize_file(
         // tokenize all data at once
         out_tokens.resize(buf.size() + n_max_tokens_overhead);
 
-        int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), (int) out_tokens.size(), false);
+        int n_tokens = llama_tokenize(
+            lctx,
+            buf.data(),
+            (int) buf.size(),
+            out_tokens.data(),
+            (int) out_tokens.size(),
+            false);
         if (n_tokens < 0) {
             out_tokens.resize(-n_tokens);
-            n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), (int) out_tokens.size(), false);
+            n_tokens = llama_tokenize(
+                lctx,
+                buf.data(),
+                (int) buf.size(),
+                out_tokens.data(),
+                (int) out_tokens.size(),
+                false);
         }
         if (n_tokens >= 0) {
             out_tokens.resize(n_tokens);
@@ -948,14 +960,18 @@ size_t tokenize_file(
                 tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
                 int n_tokens = llama_tokenize(lctx,
                     buf_sample.data(),
+                    (int) buf_sample.size(),
                     tok_sample.data(),
-                    (int) tok_sample.size(), false);
+                    (int) tok_sample.size(),
+                    false);
                 if (n_tokens < 0) {
                     tok_sample.resize(-n_tokens);
                     n_tokens = llama_tokenize(lctx,
                         buf_sample.data(),
+                        (int) buf_sample.size(),
                         tok_sample.data(),
-                        (int) tok_sample.size(), false);
+                        (int) tok_sample.size(),
+                        false);
                     GGML_ASSERT(n_tokens >= 0);
                 }
                 GGML_ASSERT(n_tokens <= (int) tok_sample.size());