some corrections and added as cmake option
This commit is contained in:
parent
da7f370a94
commit
733b566bac
4 changed files with 141 additions and 100 deletions
CMakeLists.txt

@@ -71,6 +71,7 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
 
 #
 # Build info header
examples/CMakeLists.txt

@@ -36,6 +36,8 @@ else()
     add_subdirectory(embedding)
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
-    add_subdirectory(server)
     add_subdirectory(baby-llama)
+    if(LLAMA_BUILD_SERVER)
+        add_subdirectory(server)
+    endif()
 endif()
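With these two changes the server example is no longer built unconditionally; it has to be enabled at configure time. A minimal sketch of enabling it (the build directory name is an assumption, only the `LLAMA_BUILD_SERVER` option comes from this commit):

```bash
# Configure with the server example enabled, then build.
cmake -B build -DLLAMA_BUILD_SERVER=ON
cmake --build build --config Release
```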
examples/server/README.md

@@ -73,7 +73,8 @@ You can interact with this API Endpoints. This implementations just support chat
 
 - **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks.
 
-Options:
+*Options:*
+
 `batch_size`: Set the batch size for prompt processing (default: 512).
 
 `temperature`: Adjust the randomness of the generated text (default: 0.8).
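For reference, a minimal sketch of calling the completion endpoint with these options from Node.js, in the same axios style as the examples further down in this README; the host, port, and option values are assumptions, while the field names (`prompt`, `batch_size`, `temperature`) and the response fields (`content`, `tokens_predicted`) come from this diff:

```js
// Hypothetical usage sketch for POST /completion (server assumed at 127.0.0.1:8080).
const axios = require("axios");

async function complete() {
    const result = await axios.post("http://127.0.0.1:8080/completion", {
        prompt: "Building a website can be done in 10 simple steps:",
        batch_size: 512,   // prompt processing batch size (default: 512)
        temperature: 0.8   // sampling temperature (default: 0.8)
    });
    // Non-looping mode returns the whole completion at once.
    console.log(result.data.content, result.data.tokens_predicted);
}

complete();
```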
@@ -100,6 +101,8 @@ Options:
 
 - **POST** `hostname:port/embedding`: Generate embedding of a given text
 
+*Options:*
+
 `content`: Set the text to get generate the embedding.
 
 `threads`: Set the number of threads to use during computation.
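Similarly, a hedged sketch of the embedding endpoint; the server is assumed to have been started with `--embedding`, and the shape of the response is an assumption, since only the request options appear in this diff:

```js
// Hypothetical usage sketch for POST /embedding.
const axios = require("axios");

async function embed(text) {
    const result = await axios.post("http://127.0.0.1:8080/embedding", {
        content: text, // text to embed
        threads: 4     // number of threads to use during computation
    });
    return result.data; // response fields are not shown in this diff; inspect result.data
}

embed("Hello, world!").then(console.log);
```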
@@ -108,10 +111,16 @@ To use this endpoint, you need to start the server with the `--embedding` option
 
 - **POST** `hostname:port/tokenize`: Tokenize a given text
 
+*Options:*
+
 `content`: Set the text to tokenize.
 
 - **GET** `hostname:port/next-token`: Receive the next token predicted, execute this request in a loop. Make sure set `as_loop` as `true` in the completion request.
 
+*Options:*
+
+`stop`: Set `hostname:port/next-token?stop=true` to stop the token generation.
+
 ## More examples
 
 ### Interactive mode
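A short sketch tying the new `*Options:*` entries together: tokenizing a text and stopping an in-progress generation through the new `stop` query parameter. Host, port, and the sample strings are assumptions:

```js
// Hypothetical usage sketch for POST /tokenize and GET /next-token?stop=true.
const axios = require("axios");

async function example() {
    // Tokenize a given text.
    const tok = await axios.post("http://127.0.0.1:8080/tokenize", {
        content: "Hello, llama!"
    });
    console.log(tok.data); // token list; exact response fields are not shown in this diff

    // Ask the server to stop generating tokens for the current completion.
    await axios.get("http://127.0.0.1:8080/next-token?stop=true");
}

example();
```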
@@ -155,6 +164,7 @@ async function ChatCompletion(answer) {
 
     let message = "";
     while (true) {
+        // you can stop the inference adding '?stop=true' like this http://127.0.0.1:8080/next-token?stop=true
         result = await axios.get("http://127.0.0.1:8080/next-token");
         process.stdout.write(result.data.content);
         message += result.data.content;

@@ -226,7 +236,7 @@ async function DoInstruction(instruction) {
 }
 
 // This function should be called every time a instruction to the model is needed.
-DoInstruction("Destroy the world");
+DoInstruction("Destroy the world"); // as joke
 ```
 
 ### Embeddings
examples/server/server.cpp

@@ -11,11 +11,11 @@ struct server_params
 
 struct llama_server_context
 {
-    bool context_config = false;
+    bool as_loop = false;
     bool has_next_token = false;
-    bool is_interacting = false;
+    std::string generated_text = "";
 
-    int32_t tokens_completion = 0;
+    int32_t num_tokens_predicted = 0;
     int32_t n_past = 0;
     int32_t n_consumed = 0;
     int32_t n_session_consumed = 0;

@@ -27,10 +27,19 @@ struct llama_server_context
     std::vector<llama_token> llama_token_newline;
     std::vector<llama_token> embd_inp;
     std::vector<std::vector<llama_token>> no_show_words;
+    std::vector<llama_token> tokens_predicted;
 
     llama_context *ctx;
     gpt_params params;
 
+    void rewind() {
+        as_loop = false;
+        params.antiprompt.clear();
+        no_show_words.clear();
+        num_tokens_predicted = 0;
+        generated_text = "";
+    }
+
     bool loadModel(gpt_params params_)
     {
         params = params_;

@@ -123,7 +132,7 @@ struct llama_server_context
             }
         }
         embd.clear();
-        if ((int)embd_inp.size() <= n_consumed && !is_interacting)
+        if ((int)embd_inp.size() <= n_consumed && has_next_token)
         {
             // out of user input, sample next token
             const float temp = params.temp;

@@ -206,6 +215,7 @@ struct llama_server_context
                 last_n_tokens.erase(last_n_tokens.begin());
                 last_n_tokens.push_back(id);
                 processed_tokens.push_back(id);
+                num_tokens_predicted++;
             }
 
             // replace end of text token with newline token when in interactive mode

@@ -225,7 +235,6 @@ struct llama_server_context
             for (auto id : embd)
             {
                 result = id;
-                tokens_completion++;
             }
             // decrement remaining sampling budget
             --n_remain;

@@ -262,7 +271,6 @@ struct llama_server_context
                 {
                     if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos)
                     {
-                        is_interacting = true;
                         has_next_token = false;
                         return result;
                     }

@@ -270,7 +278,7 @@ struct llama_server_context
             }
             if (n_past > 0)
             {
-                is_interacting = false;
+                has_next_token = true;
             }
         }
 

@@ -281,35 +289,35 @@ struct llama_server_context
         if (params.interactive && n_remain <= 0 && params.n_predict != -1)
         {
             n_remain = params.n_predict;
-            is_interacting = true;
         }
         has_next_token = n_remain != 0;
         return result;
     }
 
-    std::string inference()
+    std::string doCompletion()
     {
         llama_token token = nextToken();
         if (token == -1) {
             return "";
         }
-        std::vector<llama_token> tokens_completion;
-        tokens_completion.push_back(token);
+        tokens_predicted.clear();
+        tokens_predicted.push_back(token);
 
         // Avoid add the no show words to the response
         for (std::vector<llama_token> word_tokens : no_show_words)
         {
             int match_token = 1;
-            if (tokens_completion[0] == word_tokens[0])
+            if (tokens_predicted.front() == word_tokens.front())
             {
                 bool execute_matching = true;
-                if (tokens_completion.size() > 1) { // if previus tokens had been tested
+                if (tokens_predicted.size() > 1) { // if previus tokens had been tested
                     for (int i = 1; i < word_tokens.size(); i++)
                     {
-                        if (i >= tokens_completion.size()) {
+                        if (i >= tokens_predicted.size()) {
                             match_token = i;
                             break;
                         }
-                        if (tokens_completion[i] == word_tokens[i])
+                        if (tokens_predicted[i] == word_tokens[i])
                         {
                             continue;
                         }

@@ -325,24 +333,26 @@ struct llama_server_context
                         return "";
                     }
                     token = nextToken();
-                    tokens_completion.push_back(token);
+                    tokens_predicted.push_back(token);
                     if (token == word_tokens[match_token])
                     { // the token follow the sequence
                         match_token++;
                     }
                     else if (match_token < word_tokens.size())
-                    { // no complete all user tag
+                    { // no complete all word sequence
                        break;
                     }
                 }
             }
         }
-        std::string result = "";
-        for (llama_token tkn : tokens_completion)
-        {
-            result += llama_token_to_str(ctx, tkn);
+        if(as_loop) {
+            generated_text = "";
         }
-        return result;
+        for (llama_token tkn : tokens_predicted)
+        {
+            generated_text += llama_token_to_str(ctx, tkn);
+        }
+        return generated_text;
     }
 
     std::vector<float> embedding(std::string content, int threads) {
@@ -491,6 +501,76 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
     return true;
 }
 
+bool parse_options_completion(json body, llama_server_context& llama, Response &res) {
+    if (!body["threads"].is_null())
+    {
+        llama.params.n_threads = body["threads"].get<int>();
+    }
+    if (!body["n_predict"].is_null())
+    {
+        llama.params.n_predict = body["n_predict"].get<int>();
+    }
+    if (!body["top_k"].is_null())
+    {
+        llama.params.top_k = body["top_k"].get<int>();
+    }
+    if (!body["top_p"].is_null())
+    {
+        llama.params.top_p = body["top_p"].get<float>();
+    }
+    if (!body["temperature"].is_null())
+    {
+        llama.params.temp = body["temperature"].get<float>();
+    }
+    if (!body["batch_size"].is_null())
+    {
+        llama.params.n_batch = body["batch_size"].get<int>();
+    }
+    if (!body["n_keep"].is_null())
+    {
+        llama.params.n_keep = body["n_keep"].get<int>();
+    }
+    if (!body["as_loop"].is_null())
+    {
+        llama.as_loop = body["as_loop"].get<bool>();
+    }
+    if (!body["interactive"].is_null())
+    {
+        llama.params.interactive = body["interactive"].get<bool>();
+    }
+    if (!body["prompt"].is_null())
+    {
+        llama.params.prompt = body["prompt"].get<std::string>();
+    }
+    else
+    {
+        json data = {
+            {"status", "error"},
+            {"reason", "You need to pass the prompt"}};
+        res.set_content(data.dump(), "application/json");
+        res.status = 400;
+        return false;
+    }
+    if (!body["stop"].is_null())
+    {
+        std::vector<std::string> stop_words = body["stop"].get<std::vector<std::string>>();
+        for (std::string stop_word : stop_words)
+        {
+            llama.params.antiprompt.push_back(stop_word);
+            llama.no_show_words.push_back(::llama_tokenize(llama.ctx, stop_word, false));
+        }
+    }
+    if (!body["exclude"].is_null())
+    {
+        std::vector<std::string> no_show_words = body["exclude"].get<std::vector<std::string>>();
+        for (std::string no_show : no_show_words)
+        {
+            llama.no_show_words.push_back(::llama_tokenize(llama.ctx, no_show, false));
+        }
+    }
+    return true;
+}
+
 int main(int argc, char **argv)
 {
     // own arguments required by this example
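The new `parse_options_completion` helper above reads every completion option from the JSON request body, so a single payload can configure a request end to end. A sketch of such a body follows; only the key names are taken from the code, the values and example strings are assumptions:

```js
// Hypothetical /completion request body exercising the options parsed above,
// e.g. axios.post("http://127.0.0.1:8080/completion", body)
const body = {
    prompt: "### Instruction: Write a haiku about llamas\n### Response:", // required; otherwise the server answers 400 "You need to pass the prompt"
    threads: 8,
    n_predict: 64,
    top_k: 40,
    top_p: 0.9,
    temperature: 0.7,
    batch_size: 512,
    n_keep: 0,
    as_loop: true,               // stream token by token via GET /next-token
    interactive: true,
    stop: ["### Instruction:"],  // each entry becomes an antiprompt and is hidden from the output
    exclude: ["### Response:"]   // tokenized and added to no_show_words
};
```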
@@ -535,73 +615,12 @@ int main(int argc, char **argv)
                 return;
             }
 
-            json body = json::parse(req.body);
-            llama.params.antiprompt.clear();
-            llama.no_show_words.clear();
-            bool as_loop = false;
+            llama.rewind();
 
-            if (!body["threads"].is_null())
-            {
-                llama.params.n_threads = body["threads"].get<int>();
-            }
-            if (!body["n_predict"].is_null())
-            {
-                llama.params.n_predict = body["n_predict"].get<int>();
-            }
-            if (!body["top_k"].is_null())
-            {
-                llama.params.top_k = body["top_k"].get<int>();
-            }
-            if (!body["top_p"].is_null())
-            {
-                llama.params.top_p = body["top_p"].get<float>();
-            }
-            if (!body["temperature"].is_null())
-            {
-                llama.params.temp = body["temperature"].get<float>();
-            }
-            if (!body["batch_size"].is_null())
-            {
-                llama.params.n_batch = body["batch_size"].get<int>();
-            }
-            if (!body["n_keep"].is_null())
-            {
-                llama.params.n_keep = body["n_keep"].get<int>();
-            }
-            if (!body["as_loop"].is_null())
-            {
-                as_loop = body["as_loop"].get<bool>();
-            }
-            if (!body["interactive"].is_null())
-            {
-                llama.params.interactive = body["interactive"].get<bool>();
-            }
-            if (!body["prompt"].is_null())
-            {
-                llama.params.prompt = body["prompt"].get<std::string>();
-            }
-            else
-            {
-                json data = {
-                    {"status", "error"},
-                    {"reason", "You need to pass the prompt"}};
-                res.set_content(data.dump(), "application/json");
-                res.status = 400;
+            if(parse_options_completion(json::parse(req.body), llama, res) == false){
                 return;
             }
-            if (!body["stop"].is_null()) {
-                std::vector<std::string> stop_words = body["stop"].get<std::vector<std::string>>();
-                for (std::string stop_word : stop_words) {
-                    llama.params.antiprompt.push_back(stop_word);
-                    llama.no_show_words.push_back(::llama_tokenize(llama.ctx, stop_word, false));
-                }
-            }
-            if (!body["exclude"].is_null()) {
-                std::vector<std::string> no_show_words = body["exclude"].get<std::vector<std::string>>();
-                for (std::string no_show : no_show_words) {
-                    llama.no_show_words.push_back(::llama_tokenize(llama.ctx, no_show, false));
-                }
-            }
             if (!llama.loadPrompt())
             {
                 json data = {
@@ -611,23 +630,33 @@ int main(int argc, char **argv)
                 res.status = 400;
                 return;
             }
 
             llama.beginCompletion();
-            llama.tokens_completion = 0;
-            if(as_loop) {
+            if(llama.as_loop) {
                 json data = {
                     {"status", "done" } };
                 return res.set_content(data.dump(), "application/json");
             } else {
-                // Send all completion when finish
-                std::string completion = "";
+                // loop inference until finish completion
                 while (llama.has_next_token)
                 {
-                    completion += llama.inference();
+                    llama.doCompletion();
                 }
+                try
+                {
                     json data = {
-                        {"content", completion.c_str()},
-                        {"total_tokens", llama.tokens_completion}};
+                        {"content", llama.generated_text },
+                        {"tokens_predicted", llama.num_tokens_predicted}};
                     return res.set_content(data.dump(), "application/json");
+                }
+                catch (json::exception e)
+                {
+                    // Some tokens have bad UTF-8 strings, the json parser is very sensitive
+                    json data = {
+                        {"content", "Bad encoding token"},
+                        {"tokens_predicted", 0}};
+                    return res.set_content(data.dump(), "application/json");
+                }
             } });
 
     svr.Post("/tokenize", [&llama](const Request &req, Response &res)
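A sketch of how a client might handle both response modes of the rewritten handler: with `as_loop` the endpoint only acknowledges with `{"status": "done"}` and tokens are pulled from `/next-token`, otherwise the full result arrives as `{content, tokens_predicted}` (or the `"Bad encoding token"` fallback when the text was not valid UTF-8). The loop's termination condition is an assumption, since the `/next-token` handler is only partially visible here:

```js
// Hypothetical client handling both modes of POST /completion.
const axios = require("axios");

async function complete(body) {
    const res = await axios.post("http://127.0.0.1:8080/completion", body);
    if (body.as_loop) {
        // Server replied with {"status": "done"}; fetch the generated tokens one by one.
        let text = "";
        while (true) {
            const next = await axios.get("http://127.0.0.1:8080/next-token");
            if (!next.data.content) break; // assumption: empty content means generation finished
            text += next.data.content;
        }
        return text;
    }
    // Single-shot mode: content may be "Bad encoding token" if the output was not valid UTF-8.
    return res.data.content;
}
```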
@@ -664,9 +693,8 @@ int main(int argc, char **argv)
             std::string result = "";
             if (req.has_param("stop")) {
                 llama.has_next_token = false;
-                llama.is_interacting = true;
             } else {
-                result = llama.inference();
+                result = llama.doCompletion(); // inference next token
             }
             try {
                 json data = {