From a9c34779f6cc4de3836087a4935cec7eb22db49e Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Fri, 9 Jun 2023 04:47:18 -0400 Subject: [PATCH] Spaces to 4 and other code style cleanup. Notes in README. --- examples/server/README.md | 4 +- examples/server/server.cpp | 1680 ++++++++++++++++++------------------ 2 files changed, 842 insertions(+), 842 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index d98c7a203..364291ee3 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -23,6 +23,8 @@ Command line options: ## Quick Start +**Note:** The server is not built by default. Make sure to add `LLAMA_BUILD_SERVER=ON` to your CMake command. + To get started right away, run the following command, making sure to use the correct path for the model you have: ### Unix-based systems (Linux, macOS, etc.): @@ -99,7 +101,7 @@ node . `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). - `n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity). + `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the the limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity). `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 94953b35e..2cd880685 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -7,17 +7,17 @@ struct server_params { - std::string hostname = "127.0.0.1"; - int32_t port = 8080; - int32_t read_timeout = 600; - int32_t write_timeout = 600; - bool verbose = false; + std::string hostname = "127.0.0.1"; + int32_t port = 8080; + int32_t read_timeout = 600; + int32_t write_timeout = 600; + bool verbose = false; }; -static size_t common_part(const std::vector & a, const std::vector & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++); - return i; +static size_t common_part(const std::vector& a, const std::vector& b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++); + return i; } enum stop_type { @@ -25,13 +25,13 @@ enum stop_type { STOP_PARTIAL, }; -bool ends_with(const std::string &str, const std::string &suffix) +bool ends_with(const std::string& str, const std::string& suffix) { return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } -size_t find_partial_stop_string(const std::string &stop, const std::string &text) +size_t find_partial_stop_string(const std::string& stop, const std::string& text) { if (!text.empty()) { const char text_last_char = text.back(); @@ -47,938 +47,936 @@ size_t find_partial_stop_string(const std::string &stop, const std::string &text return std::string::npos; } -static std::string debug_str(const std::string & s) { - std::string ret; - for (size_t i = 0; s[i]; i++) { - switch (s[i]) { - case '\n': ret += "\\n"; break; - case '"': ret += "\\\""; break; - default: ret += s[i]; break; +static std::string debug_str(const std::string& s) { + std::string ret; + for (size_t i = 0; s[i]; i++) { + switch (s[i]) { + case '\n': ret += "\\n"; break; + case '"': ret += "\\\""; 
break; + default: ret += s[i]; break; + } } - } - return ret; + return ret; } template -static std::string tokens_to_str(llama_context * ctx, InputIt begin, OutputIt end) { - std::string ret; - for (; begin != end; (void)++begin) { - ret += llama_token_to_str(ctx, *begin); - } - return ret; +static std::string tokens_to_str(llama_context* ctx, InputIt begin, OutputIt end) { + std::string ret; + for (; begin != end; (void)++begin) { + ret += llama_token_to_str(ctx, *begin); + } + return ret; } struct llama_server_context { - bool stream = false; - bool has_next_token = false; - std::string generated_text = ""; + bool stream = false; + bool has_next_token = false; + std::string generated_text = ""; - size_t num_tokens_predicted = 0; - size_t n_past = 0; - size_t n_remain = 0; + size_t num_tokens_predicted = 0; + size_t n_past = 0; + size_t n_remain = 0; - std::vector embd; - std::vector last_n_tokens; + std::vector embd; + std::vector last_n_tokens; - llama_context *ctx = nullptr; - gpt_params params; + llama_context* ctx = nullptr; + gpt_params params; - std::string stopping_word; + std::string stopping_word; - bool verbose = false; - int json_indent = -1; - int32_t multibyte_pending = 0; + bool verbose = false; + int json_indent = -1; + int32_t multibyte_pending = 0; - ~llama_server_context() - { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - } - - void rewind() { - params.antiprompt.clear(); - num_tokens_predicted = 0; - generated_text = ""; - generated_text.reserve(params.n_ctx); - stopping_word = ""; - multibyte_pending = 0; - - n_remain = 0; - n_past = 0; - } - - bool loadModel(const gpt_params ¶ms_) - { - params = params_; - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) + ~llama_server_context() { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return false; - } - - last_n_tokens.resize(params.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - return true; - } - - void loadPrompt() { - params.prompt.insert(0, 1, ' '); // always add a first space - std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); - - if (params.n_keep < 0) { - params.n_keep = (int)prompt_tokens.size(); - } - params.n_keep = std::min(params.n_ctx - 4, params.n_keep); - - // if input prompt is too big, truncate like normal - if (prompt_tokens.size() >= (size_t)params.n_ctx) { - const int n_left = (params.n_ctx - params.n_keep)/2; - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); - - if (verbose) { - fprintf(stderr, - "input truncated: {\n" - " n_ctx: %d,\n" - " n_keep: %d,\n" - " n_left: %d,\n" - " new_tokens: \"%s\",\n" - "}\n", - params.n_ctx, params.n_keep, n_left, - debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); - } - - prompt_tokens = new_tokens; - } else { - const size_t ps = prompt_tokens.size(); - std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); - } - - // compare the evaluated prompt with the new prompt - n_past = common_part(embd, prompt_tokens); - embd = prompt_tokens; - if (n_past == prompt_tokens.size()) { - // we have to evaluate at least 1 token to generate 
logits. - n_past--; - } - - if (verbose) { - fprintf(stderr, - "prompt: {\n" - " n_past: %zu,\n" - " cached: \"%s\",\n" - " to_eval: \"%s\",\n" - "}\n", - n_past, - debug_str(tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)).c_str(), - debug_str(tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())).c_str()); - } - - has_next_token = true; - } - - void beginCompletion() - { - // number of tokens to keep when resetting context - n_remain = params.n_predict; - llama_set_rng_seed(ctx, params.seed); - } - - llama_token nextToken() { - llama_token result = -1; - - if (embd.size() >= (size_t)params.n_ctx) { - // Reset context - const int n_left = (params.n_ctx - params.n_keep)/2; - - std::vector new_tokens(embd.begin(), embd.begin() + params.n_keep); - new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); - embd = new_tokens; - n_past = params.n_keep; - if (verbose) { - fprintf(stderr, - "input truncated: {\n" - " n_ctx: %d,\n" - " n_keep: %d,\n" - " n_left: %d,\n" - " new_tokens: \"%s\",\n" - "}\n", - params.n_ctx, params.n_keep, n_left, - debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); - } - } - - while (n_past < embd.size()) - { - int n_eval = (int)embd.size() - n_past; - if (n_eval > params.n_batch) - { - n_eval = params.n_batch; - } - if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) - { - fprintf(stderr, "%s : failed to eval\n", __func__); - has_next_token = false; - return result; - } - n_past += n_eval; - } - - // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
params.n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - llama_token id = 0; - { - auto *logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); - - // Apply params.logit_bias map - for (const auto &it : params.logit_bias) { - logits[it.first] += it.second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) - { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; - - // Apply penalties - float nl_logit = logits[llama_token_nl()]; - auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); - llama_sample_repetition_penalty(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) - { - logits[llama_token_nl()] = nl_logit; - } - - if (temp <= 0) - { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } - else - { - if (mirostat == 1) - { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } - else if (mirostat == 2) - { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } - else - { - // Temperature sampling - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - num_tokens_predicted++; - } - - // add it to the context - embd.push_back(id); - result = id; - // decrement remaining sampling budget - --n_remain; - - if (!embd.empty() && embd.back() == llama_token_eos()) { - stopping_word = llama_token_to_str(ctx, embd.back()); - has_next_token = false; - if (verbose) { - fprintf(stderr, "eos token found!\n"); - } - return result; - } - - has_next_token = params.n_predict == -1 || n_remain != 0; - return result; - } - - size_t findStoppingStrings(const std::string &text, const size_t last_token_size, - const stop_type type) - { - size_t stop_pos = std::string::npos; - for (const std::string &word : params.antiprompt) { - size_t pos; - if (type == STOP_FULL) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? 
text.size() - tmp : 0; - pos = text.find(word, from_pos); - } else { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_FULL) { - stopping_word = word; - has_next_token = false; - } - stop_pos = pos; + if (ctx) { + llama_free(ctx); + ctx = nullptr; } } - return stop_pos; - } - std::string doCompletion() - { - llama_token token = nextToken(); - - std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token); - generated_text += token_text; - - if (multibyte_pending > 0) { - multibyte_pending -= token_text.size(); - } else if (token_text.size() == 1) { - const char c = token_text[0]; - // 2-byte characters: 110xxxxx 10xxxxxx - if ((c & 0xE0) == 0xC0) { - multibyte_pending = 1; - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF0) == 0xE0) { - multibyte_pending = 2; - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF8) == 0xF0) { - multibyte_pending = 3; - } else { + void rewind() { + params.antiprompt.clear(); + num_tokens_predicted = 0; + generated_text = ""; + generated_text.reserve(params.n_ctx); + stopping_word = ""; multibyte_pending = 0; - } + + n_remain = 0; + n_past = 0; } - if (multibyte_pending > 0 && !has_next_token) { - has_next_token = true; - n_remain++; + bool loadModel(const gpt_params& params_) + { + params = params_; + ctx = llama_init_from_gpt_params(params); + if (ctx == NULL) + { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return false; + } + + last_n_tokens.resize(params.n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + return true; } - if (verbose) { - fprintf(stderr, - "next token: {\n" - " token: %d,\n" - " token_text: \"%s\",\n" - " has_next_token: %d,\n" - " n_remain: %ld,\n" - " num_tokens_predicted: %ld,\n" - " stopping_word: \"%s\",\n" - "}\n", - token, debug_str(llama_token_to_str(ctx, token)).c_str(), has_next_token, n_remain, num_tokens_predicted, - debug_str(stopping_word).c_str()); + void loadPrompt() { + params.prompt.insert(0, 1, ' '); // always add a first space + std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); + + if (params.n_keep < 0) { + params.n_keep = (int)prompt_tokens.size(); + } + params.n_keep = std::min(params.n_ctx - 4, params.n_keep); + + // if input prompt is too big, truncate like normal + if (prompt_tokens.size() >= (size_t)params.n_ctx) { + const int n_left = (params.n_ctx - params.n_keep) / 2; + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); + const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left; + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); + std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); + + if (verbose) { + fprintf(stderr, + "input truncated: {\n" + " n_ctx: %d,\n" + " n_keep: %d,\n" + " n_left: %d,\n" + " new_tokens: \"%s\",\n" + "}\n", + params.n_ctx, params.n_keep, n_left, + debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); + } + + prompt_tokens = new_tokens; + } else { + const size_t ps = prompt_tokens.size(); + std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); + std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); + } + + // compare the evaluated prompt with the new prompt + n_past = common_part(embd, prompt_tokens); + embd = 
prompt_tokens; + if (n_past == prompt_tokens.size()) { + // we have to evaluate at least 1 token to generate logits. + n_past--; + } + + if (verbose) { + fprintf(stderr, + "prompt: {\n" + " n_past: %zu,\n" + " cached: \"%s\",\n" + " to_eval: \"%s\",\n" + "}\n", + n_past, + debug_str(tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)).c_str(), + debug_str(tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())).c_str()); + } + + has_next_token = true; } - return token_text; - } + void beginCompletion() + { + // number of tokens to keep when resetting context + n_remain = params.n_predict; + llama_set_rng_seed(ctx, params.seed); + } + + llama_token nextToken() { + llama_token result = -1; + + if (embd.size() >= (size_t)params.n_ctx) { + // Reset context + const int n_left = (params.n_ctx - params.n_keep) / 2; + + std::vector new_tokens(embd.begin(), embd.begin() + params.n_keep); + new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); + embd = new_tokens; + n_past = params.n_keep; + if (verbose) { + fprintf(stderr, + "input truncated: {\n" + " n_ctx: %d,\n" + " n_keep: %d,\n" + " n_left: %d,\n" + " new_tokens: \"%s\",\n" + "}\n", + params.n_ctx, params.n_keep, n_left, + debug_str(tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())).c_str()); + } + } + + while (n_past < embd.size()) + { + int n_eval = (int)embd.size() - n_past; + if (n_eval > params.n_batch) + { + n_eval = params.n_batch; + } + if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) + { + fprintf(stderr, "%s : failed to eval\n", __func__); + has_next_token = false; + return result; + } + n_past += n_eval; + } + + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
params.n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + llama_token id = 0; + { + auto* logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (const auto& it : params.logit_bias) { + logits[it.first] += it.second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) + { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Apply penalties + float nl_logit = logits[llama_token_nl()]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); + llama_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) + { + logits[llama_token_nl()] = nl_logit; + } + + if (temp <= 0) + { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) + { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + num_tokens_predicted++; + } + + // add it to the context + embd.push_back(id); + result = id; + // decrement remaining sampling budget + --n_remain; + + if (!embd.empty() && embd.back() == llama_token_eos()) { + stopping_word = llama_token_to_str(ctx, embd.back()); + has_next_token = false; + if (verbose) { + fprintf(stderr, "eos token found!\n"); + } + return result; + } + + has_next_token = params.n_predict == -1 || n_remain != 0; + return result; + } + + size_t findStoppingStrings(const std::string& text, const size_t last_token_size, + const stop_type type) + { + size_t stop_pos = std::string::npos; + for (const std::string& word : params.antiprompt) { + size_t pos; + if (type == STOP_FULL) { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? 
text.size() - tmp : 0; + pos = text.find(word, from_pos); + } + else { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) { + if (type == STOP_FULL) { + stopping_word = word; + has_next_token = false; + } + stop_pos = pos; + } + } + return stop_pos; + } + + std::string doCompletion() + { + llama_token token = nextToken(); + + std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token); + generated_text += token_text; + + if (multibyte_pending > 0) { + multibyte_pending -= token_text.size(); + } else if (token_text.size() == 1) { + const char c = token_text[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) { + multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF0) == 0xE0) { + multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF8) == 0xF0) { + multibyte_pending = 3; + } else { + multibyte_pending = 0; + } + } + + if (multibyte_pending > 0 && !has_next_token) { + has_next_token = true; + n_remain++; + } + + if (verbose) { + fprintf(stderr, + "next token: {\n" + " token: %d,\n" + " token_text: \"%s\",\n" + " has_next_token: %d,\n" + " n_remain: %ld,\n" + " num_tokens_predicted: %ld,\n" + " stopping_word: \"%s\",\n" + "}\n", + token, debug_str(llama_token_to_str(ctx, token)).c_str(), has_next_token, n_remain, num_tokens_predicted, + debug_str(stopping_word).c_str()); + } + + return token_text; + } }; using namespace httplib; using json = nlohmann::json; -void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms, const server_params &sparams) +void server_print_usage(int /*argc*/, char** argv, const gpt_params& params, const server_params& sparams) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -v, --verbose verbose output (default: false)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); - if (llama_mlock_supported()) - { - fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); - } - if (llama_mmap_supported()) - { - fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); - } + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -v, --verbose verbose output (default: false)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: 
disabled)\n"); + fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); + if (llama_mlock_supported()) + { + fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + if (llama_mmap_supported()) + { + fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); - fprintf(stderr, " number of layers to store in VRAM\n"); - fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); + fprintf(stderr, " number of layers to store in VRAM\n"); + fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); + fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); #endif - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); - fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); - fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); - fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); - fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); - fprintf(stderr, "\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); + fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); + fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); + fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + fprintf(stderr, "\n"); } -void server_params_parse(int argc, char **argv, server_params &sparams, - gpt_params ¶ms) +void server_params_parse(int argc, char** argv, server_params& sparams, + gpt_params& params) { - gpt_params default_params; - server_params default_sparams; - std::string arg; - bool invalid_param = false; + gpt_params default_params; + server_params default_sparams; + std::string arg; + bool invalid_param = false; - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg == "--port") { - if (++i >= argc) { - 
invalid_param = true; - break; - } - sparams.port = std::stoi(argv[i]); - } else if (arg == "--host") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.hostname = argv[i]; - } else if (arg == "--timeout" || arg == "-to") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.read_timeout = std::stoi(argv[i]); - sparams.write_timeout = std::stoi(argv[i]); - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model = argv[i]; - } else if (arg == "-a" || arg == "--alias") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_alias = argv[i]; - } else if (arg == "-h" || arg == "--help") { - server_print_usage(argc, argv, default_params, default_sparams); - exit(0); - } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--memory-f32" || arg == "--memory_f32") { - params.memory_f16 = false; - } else if (arg == "--threads" || arg == "-t") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_batch = std::stoi(argv[i]); - params.n_batch = std::min(512, params.n_batch); - } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - break; - } + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg == "--port") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.port = std::stoi(argv[i]); + } else if (arg == "--host") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.hostname = argv[i]; + } else if (arg == "--timeout" || arg == "-to") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.read_timeout = std::stoi(argv[i]); + sparams.write_timeout = std::stoi(argv[i]); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_alias = argv[i]; + } else if (arg == "-h" || arg == "--help") { + server_print_usage(argc, argv, default_params, default_sparams); + exit(0); + } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--memory-f32" || arg == "--memory_f32") { + params.memory_f16 = false; + } else if (arg == "--threads" || arg == "-t") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - params.n_gpu_layers = std::stoi(argv[i]); + params.n_gpu_layers = std::stoi(argv[i]); #else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + fprintf(stderr, "warning: not compiled with 
GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif - } - else if (arg == "--tensor-split" || arg == "-ts") - { - if (++i >= argc) - { - invalid_param = true; - break; } + else if (arg == "--tensor-split" || arg == "-ts") + { + if (++i >= argc) + { + invalid_param = true; + break; + } #ifdef GGML_USE_CUBLAS - std::string arg_next = argv[i]; + std::string arg_next = argv[i]; - // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) - { - if (i < split_arg.size()) + for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { - params.tensor_split[i] = std::stof(split_arg[i]); + if (i < split_arg.size()) + { + params.tensor_split[i] = std::stof(split_arg[i]); + } + else + { + params.tensor_split[i] = 0.0f; + } } - else - { - params.tensor_split[i] = 0.0f; - } - } #else - fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); + fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); #endif // GGML_USE_CUBLAS - } - else if (arg == "--main-gpu" || arg == "-mg") - { - if (++i >= argc) + } + else if (arg == "--main-gpu" || arg == "-mg") { - invalid_param = true; - break; - } + if (++i >= argc) + { + invalid_param = true; + break; + } #ifdef GGML_USE_CUBLAS - params.main_gpu = std::stoi(argv[i]); + params.main_gpu = std::stoi(argv[i]); #else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. 
It is not possible to set a main GPU.\n"); #endif - } else if (arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter = argv[i]; - params.use_mmap = false; - } else if (arg == "--lora-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } else if (arg == "-v" || arg == "--verbose") { - sparams.verbose = true; - } else if (arg == "--mlock") { - params.use_mlock = true; - } else if (arg == "--no-mmap") { - params.use_mmap = false; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params, default_sparams); - exit(1); - } - } - - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params, default_sparams); - exit(1); - } -} - -json format_generation_settings(llama_server_context &llama) { - const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); - const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && - eos_bias->second < 0.0f && std::isinf(eos_bias->second); - - return json { - { "seed", llama.params.seed }, - { "temp", llama.params.temp }, - { "top_k", llama.params.top_k }, - { "top_p", llama.params.top_p }, - { "tfs_z", llama.params.tfs_z }, - { "typical_p", llama.params.typical_p }, - { "repeat_last_n", llama.params.repeat_last_n }, - { "repeat_penalty", llama.params.repeat_penalty }, - { "presence_penalty", llama.params.presence_penalty }, - { "frequency_penalty", llama.params.frequency_penalty }, - { "mirostat", llama.params.mirostat }, - { "mirostat_tau", llama.params.mirostat_tau }, - { "mirostat_eta", llama.params.mirostat_eta }, - { "penalize_nl", llama.params.penalize_nl }, - { "stop", llama.params.antiprompt }, - { "n_predict", llama.params.n_predict }, - { "n_keep", llama.params.n_keep }, - { "ignore_eos", ignore_eos }, - { "stream", llama.stream }, - { "logit_bias", llama.params.logit_bias }, - }; -} - -bool parse_options_completion(json body, llama_server_context& llama, Response &res) -{ - gpt_params default_params; - if (!body["stream"].is_null()) { - llama.stream = body["stream"].get(); - } else { - llama.stream = false; - } - if (!body["n_predict"].is_null()) { - llama.params.n_predict = body["n_predict"].get(); - } else { - llama.params.n_predict = default_params.n_predict; - } - if (!body["top_k"].is_null()) { - llama.params.top_k = body["top_k"].get(); - } else { - llama.params.top_k = default_params.top_k; - } - if (!body["top_p"].is_null()) { - llama.params.top_p = body["top_p"].get(); - } else { - llama.params.top_p = default_params.top_p; - } - if (!body["tfs_z"].is_null()) { - llama.params.tfs_z = body["tfs_z"].get(); - } else { - llama.params.tfs_z = default_params.tfs_z; - } - if (!body["typical_p"].is_null()) { - llama.params.typical_p = body["typical_p"].get(); - } else { - llama.params.typical_p = default_params.typical_p; - } - if (!body["repeat_last_n"].is_null()) { - llama.params.repeat_last_n = body["repeat_last_n"].get(); - } else { - llama.params.repeat_last_n = default_params.repeat_last_n; - } - if (!body["temperature"].is_null()) { - llama.params.temp = body["temperature"].get(); - } else { - llama.params.temp = default_params.temp; - } - if (!body["repeat_penalty"].is_null()) { - llama.params.repeat_penalty = body["repeat_penalty"].get(); - } else { - llama.params.repeat_penalty = default_params.repeat_penalty; - } - if (!body["presence_penalty"].is_null()) { - 
llama.params.presence_penalty = body["presence_penalty"].get(); - } else { - llama.params.presence_penalty = default_params.presence_penalty; - } - if (!body["frequency_penalty"].is_null()) { - llama.params.frequency_penalty = body["frequency_penalty"].get(); - } else { - llama.params.frequency_penalty = default_params.frequency_penalty; - } - if (!body["mirostat"].is_null()) { - llama.params.mirostat = body["mirostat"].get(); - } else { - llama.params.mirostat = default_params.mirostat; - } - if (!body["mirostat_tau"].is_null()) { - llama.params.mirostat_tau = body["mirostat_tau"].get(); - } else { - llama.params.mirostat_tau = default_params.mirostat_tau; - } - if (!body["mirostat_eta"].is_null()) { - llama.params.mirostat_eta = body["mirostat_eta"].get(); - } else { - llama.params.mirostat_eta = default_params.mirostat_eta; - } - if (!body["penalize_nl"].is_null()) { - llama.params.penalize_nl = body["penalize_nl"].get(); - } else { - llama.params.penalize_nl = default_params.penalize_nl; - } - if (!body["n_keep"].is_null()) { - llama.params.n_keep = body["n_keep"].get(); - } else { - llama.params.n_keep = default_params.n_keep; - } - if (!body["seed"].is_null()) { - llama.params.seed = body["seed"].get(); - } else { - llama.params.seed = time(NULL); - } - - llama.params.logit_bias.clear(); - if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) { - llama.params.logit_bias[llama_token_eos()] = -INFINITY; - } - if (body["logit_bias"].is_array()) { - int n_vocab = llama_n_vocab(llama.ctx); - for (const auto &el : body["logit_bias"]) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - if (el[1].is_number_float()) { - llama.params.logit_bias[tok] = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - llama.params.logit_bias[tok] = -INFINITY; - } + } else if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter = argv[i]; + params.use_mmap = false; + } else if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; + } else if (arg == "-v" || arg == "--verbose") { + sparams.verbose = true; + } else if (arg == "--mlock") { + params.use_mlock = true; + } else if (arg == "--no-mmap") { + params.use_mmap = false; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + server_print_usage(argc, argv, default_params, default_sparams); + exit(1); } - } } - } - if (!body["prompt"].is_null()) { - llama.params.prompt = body["prompt"].get(); - } else { - json data = {{"status", "error"}, {"reason", "You need to provide a prompt"}}; - res.set_content(data.dump(llama.json_indent), "application/json"); - res.status = 400; - return false; - } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + server_print_usage(argc, argv, default_params, default_sparams); + exit(1); + } +} - llama.params.antiprompt.clear(); - if (!body["stop"].is_null()) { - const auto stop = body["stop"].get>(); - std::copy_if(stop.begin(), stop.end(), - std::back_inserter(llama.params.antiprompt), - [](const std::string &str) { return !str.empty(); }); - } +json format_generation_settings(llama_server_context& llama) { + const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); + const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && + eos_bias->second < 0.0f && std::isinf(eos_bias->second); - if (llama.verbose) { - json tmp = 
format_generation_settings(llama); - fprintf(stderr, + return json{ + { "seed", llama.params.seed }, + { "temp", llama.params.temp }, + { "top_k", llama.params.top_k }, + { "top_p", llama.params.top_p }, + { "tfs_z", llama.params.tfs_z }, + { "typical_p", llama.params.typical_p }, + { "repeat_last_n", llama.params.repeat_last_n }, + { "repeat_penalty", llama.params.repeat_penalty }, + { "presence_penalty", llama.params.presence_penalty }, + { "frequency_penalty", llama.params.frequency_penalty }, + { "mirostat", llama.params.mirostat }, + { "mirostat_tau", llama.params.mirostat_tau }, + { "mirostat_eta", llama.params.mirostat_eta }, + { "penalize_nl", llama.params.penalize_nl }, + { "stop", llama.params.antiprompt }, + { "n_predict", llama.params.n_predict }, + { "n_keep", llama.params.n_keep }, + { "ignore_eos", ignore_eos }, + { "stream", llama.stream }, + { "logit_bias", llama.params.logit_bias }, + }; +} + +bool parse_options_completion(json body, llama_server_context& llama, Response& res) +{ + gpt_params default_params; + if (!body["stream"].is_null()) { + llama.stream = body["stream"].get(); + } else { + llama.stream = false; + } + if (!body["n_predict"].is_null()) { + llama.params.n_predict = body["n_predict"].get(); + } else { + llama.params.n_predict = default_params.n_predict; + } + if (!body["top_k"].is_null()) { + llama.params.top_k = body["top_k"].get(); + } else { + llama.params.top_k = default_params.top_k; + } + if (!body["top_p"].is_null()) { + llama.params.top_p = body["top_p"].get(); + } else { + llama.params.top_p = default_params.top_p; + } + if (!body["tfs_z"].is_null()) { + llama.params.tfs_z = body["tfs_z"].get(); + } else { + llama.params.tfs_z = default_params.tfs_z; + } + if (!body["typical_p"].is_null()) { + llama.params.typical_p = body["typical_p"].get(); + } else { + llama.params.typical_p = default_params.typical_p; + } + if (!body["repeat_last_n"].is_null()) { + llama.params.repeat_last_n = body["repeat_last_n"].get(); + } else { + llama.params.repeat_last_n = default_params.repeat_last_n; + } + if (!body["temperature"].is_null()) { + llama.params.temp = body["temperature"].get(); + } else { + llama.params.temp = default_params.temp; + } + if (!body["repeat_penalty"].is_null()) { + llama.params.repeat_penalty = body["repeat_penalty"].get(); + } else { + llama.params.repeat_penalty = default_params.repeat_penalty; + } + if (!body["presence_penalty"].is_null()) { + llama.params.presence_penalty = body["presence_penalty"].get(); + } else { + llama.params.presence_penalty = default_params.presence_penalty; + } + if (!body["frequency_penalty"].is_null()) { + llama.params.frequency_penalty = body["frequency_penalty"].get(); + } else { + llama.params.frequency_penalty = default_params.frequency_penalty; + } + if (!body["mirostat"].is_null()) { + llama.params.mirostat = body["mirostat"].get(); + } else { + llama.params.mirostat = default_params.mirostat; + } + if (!body["mirostat_tau"].is_null()) { + llama.params.mirostat_tau = body["mirostat_tau"].get(); + } else { + llama.params.mirostat_tau = default_params.mirostat_tau; + } + if (!body["mirostat_eta"].is_null()) { + llama.params.mirostat_eta = body["mirostat_eta"].get(); + } else { + llama.params.mirostat_eta = default_params.mirostat_eta; + } + if (!body["penalize_nl"].is_null()) { + llama.params.penalize_nl = body["penalize_nl"].get(); + } else { + llama.params.penalize_nl = default_params.penalize_nl; + } + if (!body["n_keep"].is_null()) { + llama.params.n_keep = body["n_keep"].get(); + } else { + 
llama.params.n_keep = default_params.n_keep; + } + if (!body["seed"].is_null()) { + llama.params.seed = body["seed"].get(); + } else { + llama.params.seed = time(NULL); + } + + llama.params.logit_bias.clear(); + if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) { + llama.params.logit_bias[llama_token_eos()] = -INFINITY; + } + if (body["logit_bias"].is_array()) { + int n_vocab = llama_n_vocab(llama.ctx); + for (const auto& el : body["logit_bias"]) { + if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { + if (el[1].is_number_float()) { + llama.params.logit_bias[tok] = el[1].get(); + } else if (el[1].is_boolean() && !el[1].get()) { + llama.params.logit_bias[tok] = -INFINITY; + } + } + } + } + } + + if (!body["prompt"].is_null()) { + llama.params.prompt = body["prompt"].get(); + } else { + json data = { {"status", "error"}, {"reason", "You need to provide a prompt"} }; + res.set_content(data.dump(llama.json_indent), "application/json"); + res.status = 400; + return false; + } + + llama.params.antiprompt.clear(); + if (!body["stop"].is_null()) { + const auto stop = body["stop"].get>(); + std::copy_if(stop.begin(), stop.end(), + std::back_inserter(llama.params.antiprompt), + [](const std::string& str) { return !str.empty(); }); + } + + if (llama.verbose) { + json tmp = format_generation_settings(llama); + fprintf(stderr, "-------------------------\n" "completion parameters: %s\n" "full prompt: \"%s\"\n", tmp.dump(4, ' ', false, json::error_handler_t::replace).c_str(), debug_str(llama.params.prompt).c_str()); - } + } - return true; + return true; } -int main(int argc, char **argv) +int main(int argc, char** argv) { - // own arguments required by this example - gpt_params params; - server_params sparams; + // own arguments required by this example + gpt_params params; + server_params sparams; - // struct that contains llama context and inference - llama_server_context llama; - params.model = "ggml-model.bin"; + // struct that contains llama context and inference + llama_server_context llama; + params.model = "ggml-model.bin"; - server_params_parse(argc, argv, sparams, params); + server_params_parse(argc, argv, sparams, params); - llama.verbose = sparams.verbose; - llama.json_indent = sparams.verbose ? 4 : -1; + llama.verbose = sparams.verbose; + llama.json_indent = sparams.verbose ? 
4 : -1; - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } + if (params.model_alias == "unknown") { + params.model_alias = params.model; + } - llama_init_backend(); + llama_init_backend(); - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads, - std::thread::hardware_concurrency(), llama_print_system_info()); + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads, + std::thread::hardware_concurrency(), llama_print_system_info()); - // load the model - if (!llama.loadModel(params)) - { - return 1; - } + // load the model + if (!llama.loadModel(params)) + { + return 1; + } - Server svr; + Server svr; - svr.set_default_headers({ - {"Access-Control-Allow-Origin", "*"}, - {"Access-Control-Allow-Headers", "content-type"} - }); + svr.set_default_headers({ + {"Access-Control-Allow-Origin", "*"}, + {"Access-Control-Allow-Headers", "content-type"} + }); - svr.Get("/", [](const Request &, Response &res) - { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); + svr.Get("/", [](const Request&, Response& res) + { res.set_content("
<h1>llama.cpp server works</h1>
", "text/html"); }); - svr.Post("/completion", [&llama](const Request &req, Response &res) { + svr.Post("/completion", [&llama](const Request& req, Response& res) { - llama.rewind(); - llama_reset_timings(llama.ctx); + llama.rewind(); + llama_reset_timings(llama.ctx); - if (!parse_options_completion(json::parse(req.body), llama, res)) { - return; - } + if (!parse_options_completion(json::parse(req.body), llama, res)) { + return; + } - llama.loadPrompt(); - llama.beginCompletion(); + llama.loadPrompt(); + llama.beginCompletion(); - if (!llama.stream) { - size_t stop_pos = std::string::npos; + if (!llama.stream) { + size_t stop_pos = std::string::npos; - while (llama.has_next_token) { - const std::string token_text = llama.doCompletion(); + while (llama.has_next_token) { + const std::string token_text = llama.doCompletion(); - stop_pos = llama.findStoppingStrings(llama.generated_text, - token_text.size(), STOP_FULL); - } + stop_pos = llama.findStoppingStrings(llama.generated_text, + token_text.size(), STOP_FULL); + } - if (stop_pos == std::string::npos) { - stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); - } - if (stop_pos != std::string::npos) { - llama.generated_text.erase(llama.generated_text.begin() + stop_pos, - llama.generated_text.end()); - } + if (stop_pos == std::string::npos) { + stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); + } + if (stop_pos != std::string::npos) { + llama.generated_text.erase(llama.generated_text.begin() + stop_pos, + llama.generated_text.end()); + } - json data = {{"content", llama.generated_text}, - {"stop", true}, - {"model", llama.params.model_alias}, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word}}; + json data = { {"content", llama.generated_text}, + {"stop", true}, + {"model", llama.params.model_alias}, + {"tokens_predicted", llama.num_tokens_predicted}, + {"generation_settings", format_generation_settings(llama)}, + {"prompt", llama.params.prompt}, + {"stopping_word", llama.stopping_word} }; - llama_print_timings(llama.ctx); + llama_print_timings(llama.ctx); - res.set_content( - data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), - "application/json"); - } else { - const auto chunked_content_provider = [&](size_t, DataSink &sink) { - size_t sent_count = 0; + res.set_content( + data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), + "application/json"); + } + else { + const auto chunked_content_provider = [&](size_t, DataSink& sink) { + size_t sent_count = 0; - while (llama.has_next_token) { - const std::string token_text = llama.doCompletion(); - if (llama.multibyte_pending > 0) { - continue; - } + while (llama.has_next_token) { + const std::string token_text = llama.doCompletion(); + if (llama.multibyte_pending > 0) { + continue; + } - size_t pos = std::min(sent_count, llama.generated_text.size()); + size_t pos = std::min(sent_count, llama.generated_text.size()); - const char *str_test = llama.generated_text.c_str() + pos; - size_t stop_pos = - llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); - if (stop_pos != std::string::npos) { - llama.generated_text.erase( - llama.generated_text.begin() + pos + stop_pos, - llama.generated_text.end()); - pos = std::min(sent_count, llama.generated_text.size()); - } else { - stop_pos = llama.findStoppingStrings(str_test, token_text.size(), - STOP_PARTIAL); - } + 
const char* str_test = llama.generated_text.c_str() + pos; + size_t stop_pos = + llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); + if (stop_pos != std::string::npos) { + llama.generated_text.erase( + llama.generated_text.begin() + pos + stop_pos, + llama.generated_text.end()); + pos = std::min(sent_count, llama.generated_text.size()); + } else { + stop_pos = llama.findStoppingStrings(str_test, token_text.size(), + STOP_PARTIAL); + } - std::string to_send = llama.generated_text.substr(pos, stop_pos); - sent_count += to_send.size(); + std::string to_send = llama.generated_text.substr(pos, stop_pos); + sent_count += to_send.size(); - json data; - if (llama.has_next_token) { - data = {{"content", to_send}, {"stop", false}}; - } else { - // Generation is done, send extra information. - data = { - {"content", to_send}, - {"stop", true}, - {"model", llama.params.model_alias}, - {"tokens_predicted", llama.num_tokens_predicted}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, - {"stopping_word", llama.stopping_word}, - {"generated_text", llama.generated_text}}; - } + json data; + if (llama.has_next_token) { + data = { {"content", to_send}, {"stop", false} }; + } else { + // Generation is done, send extra information. + data = { + {"content", to_send}, + {"stop", true}, + {"model", llama.params.model_alias}, + {"tokens_predicted", llama.num_tokens_predicted}, + {"generation_settings", format_generation_settings(llama)}, + {"prompt", llama.params.prompt}, + {"stopping_word", llama.stopping_word}, + {"generated_text", llama.generated_text} }; + } - std::string str = - "data: " + - data.dump(llama.has_next_token ? -1 : llama.json_indent, ' ', false, - json::error_handler_t::replace) + - "\n\n"; + std::string str = + "data: " + + data.dump(llama.has_next_token ? 
-1 : llama.json_indent, ' ', false, + json::error_handler_t::replace) + + "\n\n"; - if (llama.verbose) { - fprintf(stderr, "to_send=%s", str.c_str()); - } + if (llama.verbose) { + fprintf(stderr, "to_send=%s", str.c_str()); + } - if (!sink.write(str.data(), str.size())) { - if (llama.verbose) { - fprintf(stderr, "stream closed\n"); - } - llama_print_timings(llama.ctx); - return false; - } - } + if (!sink.write(str.data(), str.size())) { + if (llama.verbose) { + fprintf(stderr, "stream closed\n"); + } + llama_print_timings(llama.ctx); + return false; + } + } - llama_print_timings(llama.ctx); - sink.done(); - return true; - }; - res.set_chunked_content_provider("text/event-stream", chunked_content_provider); - } - }); + llama_print_timings(llama.ctx); + sink.done(); + return true; + }; + res.set_chunked_content_provider("text/event-stream", chunked_content_provider); + } + }); - svr.Options(R"(/.*)", [](const Request &, Response &res) - { - return res.set_content("", "application/json"); - }); + svr.Options(R"(/.*)", [](const Request&, Response& res) + { + return res.set_content("", "application/json"); + }); - svr.Post("/tokenize", [&llama](const Request &req, Response &res) - { - json body = json::parse(req.body); - json data = { - {"tokens", ::llama_tokenize(llama.ctx, body["content"].get(), false) } }; - return res.set_content(data.dump(llama.json_indent), "application/json"); - }); + svr.Post("/tokenize", [&llama](const Request& req, Response& res) + { + json body = json::parse(req.body); + json data = { + {"tokens", ::llama_tokenize(llama.ctx, body["content"].get(), false) } }; + return res.set_content(data.dump(llama.json_indent), "application/json"); + }); - svr.set_logger([](const Request& req, const Response& res) { - json log = { - { "status", res.status }, - { "path", req.path }, - { "request", req.body }, - { "response", res.body }, - }; - fprintf(stdout, "http_request: %s\n", - log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); - }); + svr.set_logger([](const Request& req, const Response& res) { + json log = { + { "status", res.status }, + { "path", req.path }, + { "request", req.body }, + { "response", res.body }, + }; + fprintf(stdout, "http_request: %s\n", + log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); + }); - svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { - const auto *fmt = "500 Internal Server Error\n%s"; - char buf[BUFSIZ]; - try { - std::rethrow_exception(std::move(ep)); - } catch (std::exception &e) { - snprintf(buf, sizeof(buf), fmt, e.what()); - } catch (...) { - snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); - } - res.set_content(buf, "text/plain"); - res.status = 500; - }); + svr.set_exception_handler([](const Request&, Response& res, std::exception_ptr ep) { + const auto* fmt = "500 Internal Server Error\n%s"; + char buf[BUFSIZ]; + try { + std::rethrow_exception(std::move(ep)); + } + catch (std::exception& e) { + snprintf(buf, sizeof(buf), fmt, e.what()); + } + catch (...) 
{ + snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); + } + res.set_content(buf, "text/plain"); + res.status = 500; + }); - // set timeouts and change hostname and port - svr.set_read_timeout(sparams.read_timeout); - svr.set_write_timeout(sparams.write_timeout); + // set timeouts and change hostname and port + svr.set_read_timeout(sparams.read_timeout); + svr.set_write_timeout(sparams.write_timeout); - if (!svr.bind_to_port(sparams.hostname, sparams.port)) { - fprintf(stderr, "%s: ERROR: couldn't bind server to %s:%i\n", __func__, - sparams.hostname.c_str(), sparams.port); - return 1; - } + if (!svr.bind_to_port(sparams.hostname, sparams.port)) { + fprintf(stderr, "%s: ERROR: couldn't bind server to %s:%i\n", __func__, + sparams.hostname.c_str(), sparams.port); + return 1; + } - fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, - sparams.hostname.c_str(), sparams.port); - if (!svr.listen_after_bind()) { - return 1; - } + fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, + sparams.hostname.c_str(), sparams.port); + if (!svr.listen_after_bind()) { + return 1; + } - return 0; + return 0; }
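
Example request against the default host and port once the server is built with `LLAMA_BUILD_SERVER=ON` and running (illustrative only: the prompt and parameter values are placeholders, and any field omitted from the JSON body falls back to the defaults reported under `generation_settings` in the response):

    # POST a completion request; stream=false returns a single JSON object
    curl --request POST --url http://127.0.0.1:8080/completion \
        --header "Content-Type: application/json" \
        --data '{"prompt": "Hello, my name is", "n_predict": 64, "stream": false}'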