From ddf5ac257ae63fa5fb301571b4da74389262b06a Mon Sep 17 00:00:00 2001
From: xaedes <xaedes@gmail.com>
Date: Sun, 17 Sep 2023 12:48:17 +0200
Subject: [PATCH] use new/delete for train_state instead of malloc/free

Using malloc may result in segfaults when assigning to string fields,
because malloc does not run their constructors.
---
 common/train.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index e54f9b5fe..fd34e026e 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -18,7 +18,7 @@ struct random_uniform_distribution {
 };
 
 struct train_state  * init_train_state() {
-    struct train_state * state = (struct train_state *) malloc(sizeof(struct train_state));
+    struct train_state * state = new struct train_state;
     state->train_its     = 0;
     state->train_samples = 0;
     state->train_tokens  = 0;
@@ -29,16 +29,16 @@ struct train_state  * init_train_state() {
     state->shuffle_rng_state_current = "";
     state->shuffle_rng_state_next    = "";
 
-    state->opt = (struct ggml_opt_context *) malloc(sizeof(struct ggml_opt_context));
-    memset(state->opt, 0, sizeof(struct ggml_opt_context));
+    state->opt = new struct ggml_opt_context;
+    state->opt->ctx = NULL;
     state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
 
     return state;
 }
 
 void free_train_state(struct train_state  * state) {
-    free(state->opt);
-    free(state);
+    delete state->opt; // pairs with `new struct ggml_opt_context` in init_train_state
+    delete state;      // pairs with `new struct train_state` in init_train_state
 }
 
 struct random_normal_distribution * init_random_normal_distribution(
@@ -932,7 +932,7 @@ size_t tokenize_file(
                                     : (i+1 < out_samples_begin.size()
                                         ? out_samples_begin[i+1]
                                         : data_str.size());
-            if (utf8_units[sample_end] > 0) {
+            if (sample_end < utf8_units.size() && utf8_units[sample_end] > 0) {
                 // sample end is in the middle of an utf8 character.
                 // advance sample_end to the begin of the next utf8 character.
                 sample_end += utf8_nunits[sample_end] - utf8_units[sample_end];