remove terminating '\0' from tokenization

(llama_tokenize is now passed the string length instead of relying on a terminating '\0')
xaedes 2023-09-16 21:30:49 +02:00
parent 9db2664dd1
commit dd3e7634f0


@@ -832,10 +832,9 @@ size_t tokenize_file(
     const int n_max_tokens_overhead = 1;
     std::vector<char> buf;
-    buf.resize(f.size+1);
+    buf.resize(f.size);
     f.read_raw(buf.data(), f.size);
-    buf[f.size] = '\0';
     std::vector<int> utf8_units;
     std::vector<int> utf8_nunits;
@@ -879,7 +878,7 @@ size_t tokenize_file(
         }
     } else {
         // split data into samples and tokenize each sample
-        std::string data_str(buf.data(), buf.size()-1);
+        std::string data_str(buf.data(), buf.size());
         out_samples_begin.clear();
         out_samples_size.clear();
         out_tokens.clear();
@@ -944,9 +943,8 @@ size_t tokenize_file(
         if (sample_size > 0) {
             // llama_tokenize expects zero terminated string,
             // copy sample into buffer and zero terminate it.
-            buf_sample.resize(sample_size+1);
+            buf_sample.resize(sample_size);
             memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
-            buf_sample[sample_size] = '\0';
             // printf("sample: '%s'\n", buf_sample.data());
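
The hunks above show only the buffer handling; the matching llama_tokenize call site is not part of this excerpt. As a minimal sketch of the idea, assuming a length-aware llama_tokenize (the signature, the helper name, and the variables lctx / tok_sample are illustrative assumptions, not taken from this commit), tokenizing a sample without a trailing '\0' could look roughly like this:

    // Minimal sketch, not the commit's code: tokenize one sample by passing its
    // byte length explicitly, so buf_sample never needs a trailing '\0'.
    // The exact llama_tokenize signature is an assumption; check llama.h for
    // the version you build against.
    #include <algorithm>
    #include <cstring>
    #include <string>
    #include <vector>
    #include "llama.h"

    static std::vector<llama_token> tokenize_sample_sketch(
            struct llama_context * lctx,
            const std::string    & data_str,
            size_t                 sample_begin,
            size_t                 sample_size) {
        std::vector<char> buf_sample(sample_size);            // no +1, no '\0'
        memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);

        std::vector<llama_token> tok_sample(sample_size + 2);  // rough initial guess
        int n_tokens = llama_tokenize(
            lctx,
            buf_sample.data(), (int) buf_sample.size(),        // explicit byte length
            tok_sample.data(), (int) tok_sample.size(),
            false);
        if (n_tokens < 0) {
            // llama.cpp convention: a negative return is the required token count
            tok_sample.resize(-n_tokens);
            n_tokens = llama_tokenize(
                lctx,
                buf_sample.data(), (int) buf_sample.size(),
                tok_sample.data(), (int) tok_sample.size(),
                false);
        }
        tok_sample.resize(std::max(n_tokens, 0));
        return tok_sample;
    }

The point is only that the length travels with the pointer, so the extra byte and the buf_sample[sample_size] = '\0' write become unnecessary.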