Merge branch 'ggerganov:master' into master

commit 16ecbc9a02
hsnmkls, 2024-02-06 17:24:00 +08:00, committed by GitHub
3 changed files with 64 additions and 31 deletions

convert.py

@@ -515,10 +515,14 @@ class HfVocab:
             # Yield token text, score, and type
             yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, self.special_ids  # Reuse already stored special IDs
+                token_id, token_text, self.special_ids  # Reuse already stored special IDs
             )
 
-    def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
         # Determine token type based on whether it's a special token
         return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
@@ -530,7 +534,7 @@ class HfVocab:
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list:
             if text in self.specials:
-                toktype = self.get_token_type(self.specials[text], self.special_ids)
+                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
                 score = self.get_token_score(self.specials[text])
             else:
                 toktype = gguf.TokenType.USER_DEFINED
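
The convert.py change is easy to sanity-check in isolation: SentencePiece-style byte tokens are spelled <0xNN>, and the new regex routes exactly those to gguf.TokenType.BYTE before the special/normal check runs. A minimal standalone sketch (the numeric constants mirror gguf.TokenType but are hard-coded here, and the sample tokens are made up for illustration; only the regex and its precedence come from the diff):

    import re

    # Stand-ins for gguf.TokenType members, hard-coded so this sketch runs alone.
    NORMAL, CONTROL, BYTE = 1, 3, 6

    def token_type(token_id: int, token_text: bytes, special_ids: set[int]) -> int:
        # Byte tokens look like b"<0x0A>"; fullmatch avoids matching mere prefixes.
        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
            return BYTE
        # Otherwise fall back to the special-ID check, as before the change.
        return CONTROL if token_id in special_ids else NORMAL

    special_ids = {1, 2}
    print(token_type(13, b"<0x0A>", special_ids))    # BYTE   (newline byte token)
    print(token_type(1, b"<s>", special_ids))        # CONTROL
    print(token_type(9999, b"hello", special_ids))   # NORMAL

Note that added_tokens() passes b'' for token_text, so added special tokens can never be misclassified as byte tokens.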

examples/server/README.md

@@ -137,6 +137,10 @@ node index.js
     `temperature`: Adjust the randomness of the generated text (default: 0.8).
 
+    `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
+
+    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
+
     `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
 
     `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
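
Per the two new entries above, dynamic temperature can now be steered per request. A quick way to exercise the new fields from Python (hypothetical local host/port and prompt; assumes the third-party requests package and a running server):

    import requests

    resp = requests.post(
        "http://127.0.0.1:8080/completion",  # adjust to your server
        json={
            "prompt": "Building a website can be done in 10 simple steps:",
            "n_predict": 64,
            "temperature": 0.8,
            "dynatemp_range": 0.5,     # 0.0 leaves dynamic temperature disabled
            "dynatemp_exponent": 1.0,
        },
    )
    print(resp.json()["content"])

If I read the server.cpp hunks below correctly, the effective values are also echoed back under generation_settings in the response.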

examples/server/server.cpp

@@ -432,6 +432,7 @@ struct llama_server_context
         }
 
         default_generation_settings_for_props = get_formated_generation(slots.front());
+        default_generation_settings_for_props["num_slots"] = params.n_parallel;
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
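
The new num_slots entry advertises the configured parallel slot count alongside the other defaults. Since default_generation_settings_for_props feeds the server's /props response (endpoint and key names inferred from the variable name; host/port and slot count illustrative), a client can discover the slot count up front:

    import requests

    # Assumes a server started with e.g. `./server -m model.gguf --parallel 4`.
    props = requests.get("http://127.0.0.1:8080/props").json()
    print(props["default_generation_settings"]["num_slots"])  # -> 4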
@@ -524,27 +525,29 @@ struct llama_server_context
             slot->oaicompat_model = "";
         }
 
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
         slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
         slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
         slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
         slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
         slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
         slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
+        slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
+        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
         slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
         slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
         slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
         slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
         slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
         slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
         slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
         slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
         slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
         slot->params.seed = json_value(data, "seed", default_params.seed);
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
 
         // infill
         if (data.count("input_prefix") != 0)
@@ -1002,6 +1005,8 @@ struct llama_server_context
             {"model", params.model_alias},
             {"seed", slot.params.seed},
             {"temperature", slot.sparams.temp},
+            {"dynatemp_range", slot.sparams.dynatemp_range},
+            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
             {"top_k", slot.sparams.top_k},
             {"top_p", slot.sparams.top_p},
             {"min_p", slot.sparams.min_p},
@@ -1163,13 +1168,30 @@ struct llama_server_context
         task.multitask_id = multitask_id;
 
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
-        {
-            split_multiprompt_task(task_id, task);
-        }
-
         // otherwise, it's a single-prompt task, we actually queue it
-        queue_tasks.post(task);
+        // if there's numbers in the prompt array it will be treated as an array of tokens
+        if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
+            bool numbers = false;
+            for (const auto& e : task.data.at("prompt")) {
+                if (e.is_number()) {
+                    numbers = true;
+                    break;
+                }
+            }
+
+            // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
+            // it will completely stall the server. I don't know where the bug for this is.
+            //
+            // if there are numbers, it needs to be treated like a single prompt,
+            // queue_tasks handles a mix of strings and numbers just fine.
+            if (numbers) {
+                queue_tasks.post(task);
+            } else {
+                split_multiprompt_task(task_id, task);
+            }
+        } else {
+            queue_tasks.post(task);
+        }
     }
 
     // for multiple images processing
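
The net effect of the hunk above is easiest to see from the client side: a "prompt" array of strings still fans out into one subtask per string, while an array containing numbers is now posted whole and treated as a single pre-tokenized prompt instead of being split (which, per the NOTE in the diff, stalled the server on mixed input). A sketch (hypothetical host/port; the token IDs are placeholders whose meaning depends on the loaded model):

    import requests

    url = "http://127.0.0.1:8080/completion"

    # Array of strings: split by split_multiprompt_task() into one subtask each.
    multi = requests.post(url, json={"prompt": ["Hello", "Bonjour"], "n_predict": 8})

    # Array of numbers: not split; handled as one pre-tokenized prompt.
    single = requests.post(url, json={"prompt": [1, 15043, 3186], "n_predict": 8})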
@@ -1251,7 +1273,10 @@ struct llama_server_context
     void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
     {
         int prompt_count = multiprompt_task.data.at("prompt").size();
-        assert(prompt_count > 1);
+        if (prompt_count <= 1) {
+            send_error(multiprompt_task, "error while handling multiple prompts");
+            return;
+        }
 
         // generate all the ID for subtask
         std::vector<int> subtask_ids(prompt_count);