Merge branch 'master' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	README.md
commit dcc426e2de
3 changed files with 23 additions and 11 deletions
@@ -61,7 +61,7 @@ struct llama_server_context
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
         // compare the evaluated prompt with the new prompt
         int new_prompt_len = 0;
-        for (int i = 0;i < prompt_tokens.size(); i++) {
+        for (size_t i = 0; i < prompt_tokens.size(); i++) {
             if (i < processed_tokens.size() &&
                 processed_tokens[i] == prompt_tokens[i])
             {
@@ -71,7 +71,7 @@ struct llama_server_context
             {
                 embd_inp.push_back(prompt_tokens[i]);
                 if(new_prompt_len == 0) {
-                    if(i - 1 < n_past) {
+                    if(int32_t(i) - 1 < n_past) {
                         processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
                     }
                     // Evaluate the new fragment prompt from the last token processed.
@@ -136,7 +136,7 @@ struct llama_server_context
         {
             // out of user input, sample next token
             const float temp = params.temp;
-            const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+            // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
             const float top_p = params.top_p;
             const float tfs_z = params.tfs_z;
             const float typical_p = params.typical_p;
@@ -306,12 +306,12 @@ struct llama_server_context
         // Avoid add the no show words to the response
         for (std::vector<llama_token> word_tokens : no_show_words)
         {
-            int match_token = 1;
+            size_t match_token = 1;
             if (tokens_predicted.front() == word_tokens.front())
             {
                 bool execute_matching = true;
                 if (tokens_predicted.size() > 1) { // if previus tokens had been tested
-                    for (int i = 1; i < word_tokens.size(); i++)
+                    for (size_t i = 1; i < word_tokens.size(); i++)
                     {
                         if (i >= tokens_predicted.size()) {
                             match_token = i;
@@ -601,7 +601,7 @@ int main(int argc, char **argv)
 
     Server svr;
 
-    svr.Get("/", [](const Request &req, Response &res)
+    svr.Get("/", [](const Request &, Response &res)
             { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
 
     svr.Post("/completion", [&llama](const Request &req, Response &res)
@@ -649,7 +649,7 @@ int main(int argc, char **argv)
                     {"tokens_predicted", llama.num_tokens_predicted}};
                 return res.set_content(data.dump(), "application/json");
             }
-            catch (json::exception e)
+            catch (const json::exception &e)
             {
                 // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                 json data = {
@@ -701,7 +701,7 @@ int main(int argc, char **argv)
                     {"content", result },
                     {"stop", !llama.has_next_token }};
                 return res.set_content(data.dump(), "application/json");
-            } catch (json::exception e) {
+            } catch (const json::exception &e) {
                 // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                 json data = {
                     {"content", "" },
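The hunks above are mostly warning and correctness cleanups: loop indices compared against std::vector::size() become size_t, the unused req parameter of the root handler is left unnamed, and json::exception is caught by const reference instead of by value. A minimal, self-contained sketch of the signed/unsigned pattern these hunks address (not the server code itself; n_past and the token values are placeholders):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<int> prompt_tokens = {11, 22, 33, 44};
        int n_past = 2; // number of tokens already evaluated (placeholder value)

        // Index with size_t so the comparison against size() is unsigned vs unsigned.
        for (size_t i = 0; i < prompt_tokens.size(); i++) {
            // Cast before mixing with signed arithmetic: with an unsigned i,
            // `i - 1` would wrap to SIZE_MAX when i == 0 instead of becoming -1.
            if (int32_t(i) - 1 < n_past) {
                printf("token %zu is inside the already-evaluated prefix\n", i);
            }
        }
        return 0;
    }

Catching json::exception by const reference likewise avoids an unnecessary copy of the exception object and any risk of slicing.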
ggml.c (12 changes)
@@ -3808,6 +3808,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }
 
+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
@@ -14527,6 +14531,14 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 }
 
 struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
 
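The two ggml.c hunks add a small public helper and extend graph lookup: ggml_tensor_overhead() reports the fixed per-tensor bookkeeping cost (object header plus tensor struct plus alignment padding), and ggml_get_tensor_by_name() now checks the graph's leafs (inputs and constants) before its nodes. A rough usage sketch, assuming the ggml API of this revision; the tensor names are made up for illustration:

    #include <cstdio>
    #include "ggml.h"

    int main() {
        // Size the context so per-tensor metadata is accounted for explicitly.
        struct ggml_init_params params = {
            /*.mem_size   =*/ 8 * ggml_tensor_overhead() + 1024 * 1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        ggml_set_name(x, "x_input");   // hypothetical name
        struct ggml_tensor * y = ggml_sqr(ctx, x);
        ggml_set_name(y, "y_output");  // hypothetical name

        struct ggml_cgraph gf = ggml_build_forward(y);

        // With this commit the lookup also finds leaf tensors such as x_input.
        struct ggml_tensor * t = ggml_get_tensor_by_name(&gf, "x_input");
        printf("lookup: %s\n", t != NULL ? t->name : "(not found)");

        ggml_free(ctx);
        return 0;
    }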
ggml.h (6 changes)
@@ -380,9 +380,6 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
-    // use this to compute the memory overhead of a tensor
-    static const size_t GGML_TENSOR_OVERHEAD = (GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16);
-
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -444,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
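For code built against ggml.h, the compile-time constant is gone after this change; anything that sized buffers with GGML_TENSOR_OVERHEAD has to call the new function instead. A minimal before/after sketch (the graph_overhead helper is hypothetical):

    #include <cstddef>
    #include "ggml.h"

    // Before: size_t graph_overhead(int n) { return n * GGML_TENSOR_OVERHEAD; }

    // After: the overhead is queried through the new API function.
    size_t graph_overhead(int n_tensors) {
        return (size_t) n_tensors * ggml_tensor_overhead();
    }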