clean code
This commit is contained in:
parent
15f06f6b4f
commit
ed10def70e
2 changed files with 33 additions and 81 deletions
5
main.cpp
5
main.cpp
|
@ -886,6 +886,7 @@ int main(int argc, char ** argv) {
|
||||||
printf(ANSI_COLOR_YELLOW);
|
printf(ANSI_COLOR_YELLOW);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Buffer UTF-8 byte tokens such as <0xE6>, <0xAC>, <0xA2> that span multiple outputs so they can be decoded as a complete sequence.
|
||||||
std::vector<gpt_vocab::id> buffids = {};
|
std::vector<gpt_vocab::id> buffids = {};
|
||||||
while (remaining_tokens > 0) {
|
while (remaining_tokens > 0) {
|
||||||
// predict
|
// predict
|
||||||
|
@ -949,9 +950,7 @@ int main(int argc, char ** argv) {
|
||||||
// display text
|
// display text
|
||||||
if (!input_noecho) {
|
if (!input_noecho) {
|
||||||
untokenize(sp, buffids, embd);
|
untokenize(sp, buffids, embd);
|
||||||
// for (auto id : embd) {
|
|
||||||
// printf("%s", vocab.id_to_token[id].c_str());
|
|
||||||
// }
|
|
||||||
// reset color to default if there is no pending user input
|
// reset color to default if there is no pending user input
|
||||||
if (params.use_color && embd_inp.size() <= input_consumed) {
|
if (params.use_color && embd_inp.size() <= input_consumed) {
|
||||||
printf(ANSI_COLOR_RESET);
|
printf(ANSI_COLOR_RESET);
|
||||||
|
|
109
utils.cpp
109
utils.cpp
|
@ -542,85 +542,38 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
|
||||||
return (n/k)*row_size;
|
return (n/k)*row_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & buffids, std::vector<gpt_vocab::id> & embd)
|
void untokenize(sentencepiece::SentencePieceProcessor &sp, std::vector<gpt_vocab::id> &buffids, std::vector<gpt_vocab::id> &embd)
|
||||||
{
|
{
|
||||||
// std::string output = sp.DecodeIds(embd);
|
for (auto id : embd)
|
||||||
// printf("%s", output.c_str());
|
{
|
||||||
// return;
|
std::string s = sp.IdToPiece(id); // vocab.id_to_token[id];
|
||||||
// Convert the IDs in embd to tokens using SentencePiece
|
|
||||||
// std::vector<gpt_vocab::id> pieces;
|
|
||||||
// for (const auto& id : embd) {
|
|
||||||
// //std::string s = sp.DecodeIds(id);
|
|
||||||
|
|
||||||
// //s = std::regex_replace(s, std::regex("▁"), " ");
|
if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
|
||||||
|
{
|
||||||
// // if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
|
buffids.push_back(id);
|
||||||
// // {
|
std::string txt = sp.DecodeIds(buffids);
|
||||||
// // s = sp.IdToPiece(id);
|
// printf("bufferring %s, total buffer: %s\n", s.c_str(), txt.c_str());
|
||||||
// // }
|
|
||||||
// //printf("%s", s.c_str());
|
|
||||||
|
|
||||||
// pieces.push_back(id);
|
|
||||||
// // if(s.length() > 1)
|
|
||||||
// // tokens.push_back(" ");
|
|
||||||
// }
|
|
||||||
// // Insert spaces between tokens
|
|
||||||
// // std::string text;
|
|
||||||
// // for (const auto& token : tokens) {
|
|
||||||
// // // Add a space before the token if it is not the first token and it doesn't start with a special character
|
|
||||||
// // if (!text.empty() && !(token[0] == '\0x25' && token[1] == '\0x81') && token[0] != ' ') {
|
|
||||||
// // text += ' ';
|
|
||||||
// // }
|
|
||||||
// // text += sp.DecodePieces(tokens);
|
|
||||||
// // }
|
|
||||||
// //sp.DecodeIds(embd);
|
|
||||||
// std::string text =
|
|
||||||
// sp.DecodeIds(pieces);
|
|
||||||
|
|
||||||
// printf("%s", text.c_str());
|
|
||||||
|
|
||||||
std::string buff;
|
|
||||||
for (auto id : embd) {
|
|
||||||
std::string s = sp.IdToPiece(id); //vocab.id_to_token[id];
|
|
||||||
|
|
||||||
if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
|
|
||||||
{
|
|
||||||
buffids.push_back(id);
|
|
||||||
// Extract the hexadecimal value from the token
|
|
||||||
std::string hex_value = s.substr(s.find("0x"));
|
|
||||||
|
|
||||||
// Convert the hexadecimal value to binary and print it
|
|
||||||
int decimal_value;
|
|
||||||
std::stringstream(hex_value) >> std::hex >> decimal_value;
|
|
||||||
std::bitset<8> binary_value(decimal_value);
|
|
||||||
|
|
||||||
char* bytes = reinterpret_cast<char*>(&decimal_value);
|
|
||||||
buff = buff + std::string(bytes);
|
|
||||||
//printf("bufferring %s, total buffer: %s\n", s.c_str(), buff.c_str());
|
|
||||||
}
|
|
||||||
else if(s.find("▁") == 0)
|
|
||||||
{
|
|
||||||
if(!buff.empty())
|
|
||||||
{
|
|
||||||
std::string txt = sp.DecodeIds(buffids);
|
|
||||||
printf("%s", txt.c_str());
|
|
||||||
buffids.clear();
|
|
||||||
buff = "";
|
|
||||||
}
|
|
||||||
s = std::regex_replace(s, std::regex("▁"), " ");
|
|
||||||
//s.replace(0, 2, 1, ' ');
|
|
||||||
printf("%s", s.c_str());
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if(!buff.empty())
|
|
||||||
{
|
|
||||||
std::string txt = sp.DecodeIds(buffids);
|
|
||||||
printf("%s", txt.c_str());
|
|
||||||
buffids.clear();
|
|
||||||
buff = "";
|
|
||||||
}
|
|
||||||
printf("%s", s.c_str());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
else if (s.find("▁") == 0)
|
||||||
|
{
|
||||||
|
if (!buffids.empty())
|
||||||
|
{
|
||||||
|
std::string txt = sp.DecodeIds(buffids);
|
||||||
|
printf("%s", txt.c_str());
|
||||||
|
buffids.clear();
|
||||||
|
}
|
||||||
|
s = std::regex_replace(s, std::regex("▁"), " ");
|
||||||
|
printf("%s", s.c_str());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (!buffids.empty())
|
||||||
|
{
|
||||||
|
std::string txt = sp.DecodeIds(buffids);
|
||||||
|
printf("%s", txt.c_str());
|
||||||
|
buffids.clear();
|
||||||
|
}
|
||||||
|
printf("%s", s.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
Loading…
Add table
Add a link
Reference in a new issue