Buffer UTF-8 output so that it is complete when the output is split.
This commit is contained in:
parent
86e967c54b
commit
15f06f6b4f
3 changed files with 13 additions and 6 deletions
3
main.cpp
3
main.cpp
|
@ -886,6 +886,7 @@ int main(int argc, char ** argv) {
|
||||||
printf(ANSI_COLOR_YELLOW);
|
printf(ANSI_COLOR_YELLOW);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<gpt_vocab::id> buffids = {};
|
||||||
while (remaining_tokens > 0) {
|
while (remaining_tokens > 0) {
|
||||||
// predict
|
// predict
|
||||||
if (embd.size() > 0) {
|
if (embd.size() > 0) {
|
||||||
|
@ -947,7 +948,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// display text
|
// display text
|
||||||
if (!input_noecho) {
|
if (!input_noecho) {
|
||||||
untokenize(sp, embd);
|
untokenize(sp, buffids, embd);
|
||||||
// for (auto id : embd) {
|
// for (auto id : embd) {
|
||||||
// printf("%s", vocab.id_to_token[id].c_str());
|
// printf("%s", vocab.id_to_token[id].c_str());
|
||||||
// }
|
// }
|
||||||
|
|
12
utils.cpp
12
utils.cpp
|
@ -542,7 +542,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
|
||||||
return (n/k)*row_size;
|
return (n/k)*row_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & embd)
|
void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & buffids, std::vector<gpt_vocab::id> & embd)
|
||||||
{
|
{
|
||||||
// std::string output = sp.DecodeIds(embd);
|
// std::string output = sp.DecodeIds(embd);
|
||||||
// printf("%s", output.c_str());
|
// printf("%s", output.c_str());
|
||||||
|
@ -578,12 +578,14 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_voca
|
||||||
// sp.DecodeIds(pieces);
|
// sp.DecodeIds(pieces);
|
||||||
|
|
||||||
// printf("%s", text.c_str());
|
// printf("%s", text.c_str());
|
||||||
|
|
||||||
std::string buff;
|
std::string buff;
|
||||||
for (auto id : embd) {
|
for (auto id : embd) {
|
||||||
std::string s = sp.IdToPiece(id); //vocab.id_to_token[id];
|
std::string s = sp.IdToPiece(id); //vocab.id_to_token[id];
|
||||||
|
|
||||||
if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
|
if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
|
||||||
{
|
{
|
||||||
|
buffids.push_back(id);
|
||||||
// Extract the hexadecimal value from the token
|
// Extract the hexadecimal value from the token
|
||||||
std::string hex_value = s.substr(s.find("0x"));
|
std::string hex_value = s.substr(s.find("0x"));
|
||||||
|
|
||||||
|
@ -600,7 +602,9 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_voca
|
||||||
{
|
{
|
||||||
if(!buff.empty())
|
if(!buff.empty())
|
||||||
{
|
{
|
||||||
printf("%s", buff.c_str());
|
std::string txt = sp.DecodeIds(buffids);
|
||||||
|
printf("%s", txt.c_str());
|
||||||
|
buffids.clear();
|
||||||
buff = "";
|
buff = "";
|
||||||
}
|
}
|
||||||
s = std::regex_replace(s, std::regex("▁"), " ");
|
s = std::regex_replace(s, std::regex("▁"), " ");
|
||||||
|
@ -611,7 +615,9 @@ void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_voca
|
||||||
{
|
{
|
||||||
if(!buff.empty())
|
if(!buff.empty())
|
||||||
{
|
{
|
||||||
printf("%s", buff.c_str());
|
std::string txt = sp.DecodeIds(buffids);
|
||||||
|
printf("%s", txt.c_str());
|
||||||
|
buffids.clear();
|
||||||
buff = "";
|
buff = "";
|
||||||
}
|
}
|
||||||
printf("%s", s.c_str());
|
printf("%s", s.c_str());
|
||||||
|
|
2
utils.h
2
utils.h
|
@ -105,5 +105,5 @@ void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int
|
||||||
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
||||||
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
||||||
|
|
||||||
void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & embd);
|
void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & buffids, std::vector<gpt_vocab::id> & embd);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue