buffering output for UTF-8 encoded token
This commit is contained in:
parent
1b87fe1e90
commit
86e967c54b
1 changed files with 16 additions and 1 deletions
17
utils.cpp
17
utils.cpp
|
@ -544,6 +544,9 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
|
||||||
|
|
||||||
void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & embd)
|
void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & embd)
|
||||||
{
|
{
|
||||||
|
// std::string output = sp.DecodeIds(embd);
|
||||||
|
// printf("%s", output.c_str());
|
||||||
|
// return;
|
||||||
// Convert the IDs in embd to tokens using SentencePiece
|
// Convert the IDs in embd to tokens using SentencePiece
|
||||||
// std::vector<gpt_vocab::id> pieces;
|
// std::vector<gpt_vocab::id> pieces;
|
||||||
// for (const auto& id : embd) {
|
// for (const auto& id : embd) {
|
||||||
|
@ -575,6 +578,7 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_voca
|
||||||
// sp.DecodeIds(pieces);
|
// sp.DecodeIds(pieces);
|
||||||
|
|
||||||
// printf("%s", text.c_str());
|
// printf("%s", text.c_str());
|
||||||
|
std::string buff;
|
||||||
for (auto id : embd) {
|
for (auto id : embd) {
|
||||||
std::string s = sp.IdToPiece(id); //vocab.id_to_token[id];
|
std::string s = sp.IdToPiece(id); //vocab.id_to_token[id];
|
||||||
|
|
||||||
|
@ -589,16 +593,27 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_voca
|
||||||
std::bitset<8> binary_value(decimal_value);
|
std::bitset<8> binary_value(decimal_value);
|
||||||
|
|
||||||
char* bytes = reinterpret_cast<char*>(&decimal_value);
|
char* bytes = reinterpret_cast<char*>(&decimal_value);
|
||||||
printf("%s", bytes);
|
buff = buff + std::string(bytes);
|
||||||
|
//printf("buffering %s, total buffer: %s\n", s.c_str(), buff.c_str());
|
||||||
}
|
}
|
||||||
else if(s.find("▁") == 0)
|
else if(s.find("▁") == 0)
|
||||||
{
|
{
|
||||||
|
if(!buff.empty())
|
||||||
|
{
|
||||||
|
printf("%s", buff.c_str());
|
||||||
|
buff = "";
|
||||||
|
}
|
||||||
s = std::regex_replace(s, std::regex("▁"), " ");
|
s = std::regex_replace(s, std::regex("▁"), " ");
|
||||||
//s.replace(0, 2, 1, ' ');
|
//s.replace(0, 2, 1, ' ');
|
||||||
printf("%s", s.c_str());
|
printf("%s", s.c_str());
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
if(!buff.empty())
|
||||||
|
{
|
||||||
|
printf("%s", buff.c_str());
|
||||||
|
buff = "";
|
||||||
|
}
|
||||||
printf("%s", s.c_str());
|
printf("%s", s.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue