diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7209a2b52..3904412cb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -61,7 +61,7 @@ struct llama_server_context
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
         // compare the evaluated prompt with the new prompt
         int new_prompt_len = 0;
-        for (int i = 0;i < prompt_tokens.size(); i++) {
+        for (size_t i = 0; i < prompt_tokens.size(); i++) {
             if (i < processed_tokens.size() &&
                 processed_tokens[i] == prompt_tokens[i])
             {
@@ -71,7 +71,7 @@ struct llama_server_context
             {
                 embd_inp.push_back(prompt_tokens[i]);
                 if(new_prompt_len == 0) {
-                    if(i - 1 < n_past) {
+                    if(int32_t(i) - 1 < n_past) {
                         processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
                     }
                     // Evaluate the new fragment prompt from the last token processed.
@@ -136,7 +136,7 @@ struct llama_server_context
         {
            // out of user input, sample next token
            const float temp = params.temp;
-           const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+           // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
            const float top_p = params.top_p;
            const float tfs_z = params.tfs_z;
            const float typical_p = params.typical_p;
@@ -306,12 +306,12 @@ struct llama_server_context
        // Avoid add the no show words to the response
        for (std::vector<llama_token> word_tokens : no_show_words)
        {
-           int match_token = 1;
+           size_t match_token = 1;
            if (tokens_predicted.front() == word_tokens.front())
            {
                bool execute_matching = true;
                if (tokens_predicted.size() > 1) { // if previus tokens had been tested
-                   for (int i = 1; i < word_tokens.size(); i++)
+                   for (size_t i = 1; i < word_tokens.size(); i++)
                    {
                        if (i >= tokens_predicted.size()) {
                            match_token = i;
@@ -601,7 +601,7 @@ int main(int argc, char **argv)
 
     Server svr;
 
-    svr.Get("/", [](const Request &req, Response &res)
+    svr.Get("/", [](const Request &, Response &res)
             { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
 
     svr.Post("/completion", [&llama](const Request &req, Response &res)
@@ -649,7 +649,7 @@ int main(int argc, char **argv)
                         {"tokens_predicted", llama.num_tokens_predicted}};
                     return res.set_content(data.dump(), "application/json");
                 }
-                catch (json::exception e)
+                catch (const json::exception &e)
                 {
                     // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                     json data = {
@@ -701,7 +701,7 @@ int main(int argc, char **argv)
                         {"content", result },
                         {"stop", !llama.has_next_token }};
                     return res.set_content(data.dump(), "application/json");
-                } catch (json::exception e) {
+                } catch (const json::exception &e) {
                     // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                     json data = {
                         {"content", "" },
diff --git a/ggml.c b/ggml.c
index f1e690c94..47b104bfb 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3808,6 +3808,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }
 
+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
@@ -14527,6 +14531,14 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 }
 
 struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
 
diff --git a/ggml.h b/ggml.h
index 0c90f5064..558138280 100644
--- a/ggml.h
+++ b/ggml.h
@@ -380,9 +380,6 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
-    // use this to compute the memory overhead of a tensor
-    static const size_t GGML_TENSOR_OVERHEAD = (GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16);
-
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -444,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);