remove terminating '\0' from tokenization
(llama_tokenize is now passed the string length instead of relying on terminating '\0')
This commit is contained in:
parent 9db2664dd1
commit dd3e7634f0
1 changed file with 3 additions and 5 deletions
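For orientation, below is a minimal standalone sketch of the caller-side pattern this commit moves to. The tokenize_with_len function is a hypothetical stand-in for llama_tokenize's new (text, length) interface, since the exact signature is not shown in this diff; only the before/after buffer handling mirrors the change.

// Hypothetical stand-in for a length-aware tokenizer such as the new
// llama_tokenize(text, text_len, ...); it reads exactly text_len bytes.
#include <cstring>
#include <string>
#include <vector>

static std::vector<int> tokenize_with_len(const char * text, size_t text_len) {
    std::vector<int> tokens;
    for (size_t i = 0; i < text_len; ++i) {
        tokens.push_back((unsigned char) text[i]); // dummy byte-level tokens
    }
    return tokens;
}

int main() {
    std::string data_str = "first sample\nsecond sample";
    size_t sample_begin = 0;
    size_t sample_size  = 12; // length of "first sample"

    // old pattern: copy the sample into a scratch buffer and zero-terminate it,
    // because the tokenizer stopped at the first '\0'
    std::vector<char> buf_old(sample_size + 1);
    memcpy(buf_old.data(), data_str.data() + sample_begin, sample_size);
    buf_old[sample_size] = '\0';
    std::vector<int> tokens_old = tokenize_with_len(buf_old.data(), strlen(buf_old.data()));

    // new pattern (as in this commit): buffer sized exactly to the sample,
    // no terminating '\0' appended, length passed explicitly
    std::vector<char> buf_new(sample_size);
    memcpy(buf_new.data(), data_str.data() + sample_begin, sample_size);
    std::vector<int> tokens_new = tokenize_with_len(buf_new.data(), buf_new.size());

    return tokens_old == tokens_new ? 0 : 1; // both paths yield the same tokens
}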
@@ -832,10 +832,9 @@ size_t tokenize_file(
     const int n_max_tokens_overhead = 1;
 
     std::vector<char> buf;
-    buf.resize(f.size+1);
+    buf.resize(f.size);
 
     f.read_raw(buf.data(), f.size);
-    buf[f.size] = '\0';
 
     std::vector<int> utf8_units;
     std::vector<int> utf8_nunits;
@@ -879,7 +878,7 @@ size_t tokenize_file(
         }
     } else {
         // split data into samples and tokenize each sample
-        std::string data_str(buf.data(), buf.size()-1);
+        std::string data_str(buf.data(), buf.size());
         out_samples_begin.clear();
         out_samples_size.clear();
         out_tokens.clear();
@@ -944,9 +943,8 @@ size_t tokenize_file(
             if (sample_size > 0) {
                 // llama_tokenize expects zero terminated string,
                 // copy sample into buffer and zero terminate it.
-                buf_sample.resize(sample_size+1);
+                buf_sample.resize(sample_size);
                 memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
-                buf_sample[sample_size] = '\0';
 
                 // printf("sample: '%s'\n", buf_sample.data());
 
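And a small self-contained illustration of the first two hunks, under the assumption that f.read_raw fills the buffer with exactly f.size bytes: once no sentinel '\0' is appended, every byte of the buffer is payload, so the std::string covering the data no longer needs the buf.size()-1 adjustment.

#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

int main() {
    // stand-in for the bytes that f.read_raw(buf.data(), f.size) would produce
    const char raw[] = { 'a', 'b', 'c', '\n' };
    const size_t size = sizeof(raw);

    std::vector<char> buf(size);             // previously: size + 1
    std::copy(raw, raw + size, buf.begin()); // previously followed by buf[size] = '\0'

    // the whole buffer is data now, so no "- 1" when wrapping it in a string
    std::string data_str(buf.data(), buf.size());
    return data_str.size() == size ? 0 : 1;
}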