Merge branch 'ggerganov:master' into master
commit 16ecbc9a02
3 changed files with 64 additions and 31 deletions
convert.py (10 changed lines)
@@ -515,10 +515,14 @@ class HfVocab:

             # Yield token text, score, and type
             yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, self.special_ids  # Reuse already stored special IDs
+                token_id, token_text, self.special_ids  # Reuse already stored special IDs
             )

-    def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
         # Determine token type based on whether it's a special token
         return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL

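For orientation only (not part of the commit), a minimal Python sketch of what the new classification does: token text spelled like `<0xNN>` is reported as a byte token, everything else falls back to the existing special-ID check. The names below are simplified stand-ins for the `HfVocab` method and the `gguf.TokenType` values.

```python
import re

# Simplified stand-ins for the gguf.TokenType values used by convert.py.
BYTE, CONTROL, NORMAL = "BYTE", "CONTROL", "NORMAL"

def classify(token_id: int, token_text: bytes, special_ids: set[int]) -> str:
    # Byte tokens are spelled "<0xNN>" in the tokenizer vocabulary.
    if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
        return BYTE
    # Otherwise the type depends only on whether the ID is a special token.
    return CONTROL if token_id in special_ids else NORMAL

print(classify(13, b"<0x0A>", {1, 2}))    # BYTE
print(classify(1, b"<s>", {1, 2}))        # CONTROL
print(classify(4711, b"hello", {1, 2}))   # NORMAL
```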
@@ -530,7 +534,7 @@ class HfVocab:
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list:
             if text in self.specials:
-                toktype = self.get_token_type(self.specials[text], self.special_ids)
+                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
                 score = self.get_token_score(self.specials[text])
             else:
                 toktype = gguf.TokenType.USER_DEFINED
@@ -137,6 +137,10 @@ node index.js

    `temperature`: Adjust the randomness of the generated text (default: 0.8).

+    `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
+
+    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
+
    `top_k`: Limit the next token selection to the K most probable tokens (default: 40).

    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
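A rough usage sketch, not taken from the diff: the new fields are sent in the body of a `/completion` request next to the existing sampling options. The host, port, prompt, and chosen values below are placeholders.

```python
import json
import urllib.request

# Placeholder request; assumes a llama.cpp server is running locally.
body = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 64,
    "temperature": 0.8,
    "dynatemp_range": 0.5,      # 0.0 (the default) keeps dynamic temperature disabled
    "dynatemp_exponent": 1.0,   # default shaping exponent
}
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(body).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["content"])
```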
@@ -432,6 +432,7 @@ struct llama_server_context
         }

         default_generation_settings_for_props = get_formated_generation(slots.front());
+        default_generation_settings_for_props["num_slots"] = params.n_parallel;
         default_generation_settings_for_props["seed"] = -1;

         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
@@ -524,27 +525,29 @@ struct llama_server_context
             slot->oaicompat_model = "";
         }

         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
         slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
         slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
         slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
         slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
         slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
         slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
+        slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
+        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
         slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
         slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
         slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
         slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
         slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
         slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
         slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
         slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
         slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
         slot->params.seed = json_value(data, "seed", default_params.seed);
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);

         // infill
         if (data.count("input_prefix") != 0)
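The hunk above only forwards the two new values into the slot's sampling parameters; the sampler itself is not part of this diff. As a loose illustration of the idea (not necessarily the exact formula the sampler uses), a dynamic temperature picks a value inside `[temp - dynatemp_range, temp + dynatemp_range]` depending on how flat the candidate distribution is, with `dynatemp_exponent` shaping the mapping.

```python
import math

def dynamic_temperature(probs: list[float], temp: float,
                        dynatemp_range: float, dynatemp_exponent: float) -> float:
    """Illustrative only: scale temperature by the normalized entropy of the
    candidate distribution, clamped to [temp - range, temp + range]."""
    if dynatemp_range <= 0.0:
        return temp  # 0.0 = disabled, keep the static temperature
    min_temp = max(0.0, temp - dynatemp_range)
    max_temp = temp + dynatemp_range
    entropy = -sum(p * math.log(p) for p in probs if p > 0.0)
    max_entropy = math.log(len(probs))  # entropy of a uniform distribution
    normalized = entropy / max_entropy if max_entropy > 0.0 else 0.0
    return min_temp + (max_temp - min_temp) * (normalized ** dynatemp_exponent)

# A peaked distribution gets a low temperature, a flat one a high temperature.
print(dynamic_temperature([0.97, 0.01, 0.01, 0.01], 0.8, 0.5, 1.0))
print(dynamic_temperature([0.25, 0.25, 0.25, 0.25], 0.8, 0.5, 1.0))
```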
@@ -1002,6 +1005,8 @@ struct llama_server_context
             {"model", params.model_alias},
             {"seed", slot.params.seed},
             {"temperature", slot.sparams.temp},
+            {"dynatemp_range", slot.sparams.dynatemp_range},
+            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
             {"top_k", slot.sparams.top_k},
             {"top_p", slot.sparams.top_p},
             {"min_p", slot.sparams.min_p},
@@ -1163,13 +1168,30 @@ struct llama_server_context
         task.multitask_id = multitask_id;

         // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
-        {
-            split_multiprompt_task(task_id, task);
-        }
-
         // otherwise, it's a single-prompt task, we actually queue it
-        queue_tasks.post(task);
+        // if there's numbers in the prompt array it will be treated as an array of tokens
+        if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
+            bool numbers = false;
+            for (const auto& e : task.data.at("prompt")) {
+                if (e.is_number()) {
+                    numbers = true;
+                    break;
+                }
+            }
+
+            // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
+            // it will completely stall the server. I don't know where the bug for this is.
+            //
+            // if there are numbers, it needs to be treated like a single prompt,
+            // queue_tasks handles a mix of strings and numbers just fine.
+            if (numbers) {
+                queue_tasks.post(task);
+            } else {
+                split_multiprompt_task(task_id, task);
+            }
+        } else {
+            queue_tasks.post(task);
+        }
     }

     // for multiple images processing
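A small Python sketch (illustrative only, the server works on C++ JSON objects) of the two request shapes this check separates: a prompt array of strings is split into one task per prompt, while an array containing numbers is assumed to be a single pre-tokenized prompt and is queued as one task.

```python
def prompt_contains_numbers(prompt) -> bool:
    # Mirrors the new check: any numeric element marks the array as token IDs.
    return isinstance(prompt, list) and any(isinstance(e, (int, float)) for e in prompt)

multi_prompt = ["first prompt", "second prompt"]   # split into one task per prompt
token_prompt = [1, 15043, 3186]                    # placeholder token IDs, kept as one prompt
print(prompt_contains_numbers(multi_prompt))   # False -> split_multiprompt_task
print(prompt_contains_numbers(token_prompt))   # True  -> posted as a single task
```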
@@ -1251,7 +1273,10 @@ struct llama_server_context
     void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
     {
         int prompt_count = multiprompt_task.data.at("prompt").size();
-        assert(prompt_count > 1);
+        if (prompt_count <= 1) {
+            send_error(multiprompt_task, "error while handling multiple prompts");
+            return;
+        }

         // generate all the ID for subtask
         std::vector<int> subtask_ids(prompt_count);
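The replaced `assert` is compiled out in builds that define `NDEBUG`, so a malformed request could slip through; the new guard reports an error and returns instead. A rough Python analogue of the same guard-clause pattern, with `send_error` standing in for the server's error reply:

```python
def split_multiprompt_task(multiprompt_task: dict, send_error) -> list[dict] | None:
    prompts = multiprompt_task.get("prompt", [])
    # Guard clause instead of assert(prompt_count > 1): report the problem and
    # bail out rather than aborting (or silently continuing when asserts are off).
    if len(prompts) <= 1:
        send_error(multiprompt_task, "error while handling multiple prompts")
        return None
    return [dict(multiprompt_task, prompt=p) for p in prompts]

errors = []
split_multiprompt_task({"prompt": []}, lambda task, msg: errors.append(msg))
print(errors)   # ['error while handling multiple prompts']
```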