remove terminating '\0' from tokenization

(llama_tokenize is now passed the string length instead of relying on a terminating '\0')
xaedes 2023-09-16 21:30:49 +02:00
parent 9db2664dd1
commit dd3e7634f0


@@ -832,10 +832,9 @@ size_t tokenize_file(
     const int n_max_tokens_overhead = 1;
     std::vector<char> buf;
-    buf.resize(f.size+1);
+    buf.resize(f.size);
     f.read_raw(buf.data(), f.size);
-    buf[f.size] = '\0';
     std::vector<int> utf8_units;
     std::vector<int> utf8_nunits;
@@ -879,7 +878,7 @@ size_t tokenize_file(
         }
     } else {
         // split data into samples and tokenize each sample
-        std::string data_str(buf.data(), buf.size()-1);
+        std::string data_str(buf.data(), buf.size());
         out_samples_begin.clear();
         out_samples_size.clear();
         out_tokens.clear();
@@ -944,9 +943,8 @@ size_t tokenize_file(
         if (sample_size > 0) {
             // llama_tokenize expects zero terminated string,
             // copy sample into buffer and zero terminate it.
-            buf_sample.resize(sample_size+1);
+            buf_sample.resize(sample_size);
             memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
-            buf_sample[sample_size] = '\0';
             // printf("sample: '%s'\n", buf_sample.data());
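
The hunks above show only the buffer handling; the matching llama_tokenize call site is not part of this excerpt. As a minimal sketch of the idea, assuming a length-aware llama_tokenize (the signature, the helper name, and the variables lctx / tok_sample are illustrative assumptions, not taken from this commit), tokenizing a sample without a trailing '\0' could look roughly like this:

    // Minimal sketch, not the commit's code: tokenize one sample by passing its
    // byte length explicitly, so buf_sample never needs a trailing '\0'.
    // The exact llama_tokenize signature is an assumption; check llama.h for
    // the version you build against.
    #include <algorithm>
    #include <cstring>
    #include <string>
    #include <vector>
    #include "llama.h"

    static std::vector<llama_token> tokenize_sample_sketch(
            struct llama_context * lctx,
            const std::string    & data_str,
            size_t                 sample_begin,
            size_t                 sample_size) {
        std::vector<char> buf_sample(sample_size);            // no +1, no '\0'
        memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);

        std::vector<llama_token> tok_sample(sample_size + 2);  // rough initial guess
        int n_tokens = llama_tokenize(
            lctx,
            buf_sample.data(), (int) buf_sample.size(),        // explicit byte length
            tok_sample.data(), (int) tok_sample.size(),
            false);
        if (n_tokens < 0) {
            // llama.cpp convention: a negative return is the required token count
            tok_sample.resize(-n_tokens);
            n_tokens = llama_tokenize(
                lctx,
                buf_sample.data(), (int) buf_sample.size(),
                tok_sample.data(), (int) tok_sample.size(),
                false);
        }
        tok_sample.resize(std::max(n_tokens, 0));
        return tok_sample;
    }

The point is only that the length travels with the pointer, so the extra byte and the buf_sample[sample_size] = '\0' write become unnecessary.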