Merge branch 'ggerganov:master' into master
This commit is contained in:
commit
16ecbc9a02
3 changed files with 64 additions and 31 deletions
10
convert.py
10
convert.py
|
@ -515,10 +515,14 @@ class HfVocab:
|
|||
|
||||
# Yield token text, score, and type
|
||||
yield token_text, self.get_token_score(token_id), self.get_token_type(
|
||||
token_id, self.special_ids # Reuse already stored special IDs
|
||||
token_id, token_text, self.special_ids # Reuse already stored special IDs
|
||||
)
|
||||
|
||||
def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
|
||||
def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
|
||||
# Special case for byte tokens
|
||||
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
|
||||
return gguf.TokenType.BYTE
|
||||
|
||||
# Determine token type based on whether it's a special token
|
||||
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
|
||||
|
||||
|
@ -530,7 +534,7 @@ class HfVocab:
|
|||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
if text in self.specials:
|
||||
toktype = self.get_token_type(self.specials[text], self.special_ids)
|
||||
toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
|
||||
score = self.get_token_score(self.specials[text])
|
||||
else:
|
||||
toktype = gguf.TokenType.USER_DEFINED
|
||||
|
|
|
@ -137,6 +137,10 @@ node index.js
|
|||
|
||||
`temperature`: Adjust the randomness of the generated text (default: 0.8).
|
||||
|
||||
`dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
|
||||
|
||||
`dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
|
||||
|
||||
`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
|
||||
|
||||
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
|
||||
|
|
|
@ -432,6 +432,7 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
default_generation_settings_for_props = get_formated_generation(slots.front());
|
||||
default_generation_settings_for_props["num_slots"] = params.n_parallel;
|
||||
default_generation_settings_for_props["seed"] = -1;
|
||||
|
||||
batch = llama_batch_init(n_ctx, 0, params.n_parallel);
|
||||
|
@ -533,6 +534,8 @@ struct llama_server_context
|
|||
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
|
||||
slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
|
||||
slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
|
||||
slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
|
||||
slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
|
||||
|
@ -1002,6 +1005,8 @@ struct llama_server_context
|
|||
{"model", params.model_alias},
|
||||
{"seed", slot.params.seed},
|
||||
{"temperature", slot.sparams.temp},
|
||||
{"dynatemp_range", slot.sparams.dynatemp_range},
|
||||
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
|
||||
{"top_k", slot.sparams.top_k},
|
||||
{"top_p", slot.sparams.top_p},
|
||||
{"min_p", slot.sparams.min_p},
|
||||
|
@ -1163,13 +1168,30 @@ struct llama_server_context
|
|||
task.multitask_id = multitask_id;
|
||||
|
||||
// when a completion task's prompt array is not a singleton, we split it into multiple requests
|
||||
if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
|
||||
{
|
||||
split_multiprompt_task(task_id, task);
|
||||
// otherwise, it's a single-prompt task, we actually queue it
|
||||
// if there are numbers in the prompt array, it will be treated as an array of tokens
|
||||
if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
|
||||
bool numbers = false;
|
||||
for (const auto& e : task.data.at("prompt")) {
|
||||
if (e.is_number()) {
|
||||
numbers = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// otherwise, it's a single-prompt task, we actually queue it
|
||||
// NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
|
||||
// it will completely stall the server. I don't know where the bug for this is.
|
||||
//
|
||||
// if there are numbers, it needs to be treated like a single prompt,
|
||||
// queue_tasks handles a mix of strings and numbers just fine.
|
||||
if (numbers) {
|
||||
queue_tasks.post(task);
|
||||
} else {
|
||||
split_multiprompt_task(task_id, task);
|
||||
}
|
||||
} else {
|
||||
queue_tasks.post(task);
|
||||
}
|
||||
}
|
||||
|
||||
// for multiple images processing
|
||||
|
@ -1251,7 +1273,10 @@ struct llama_server_context
|
|||
void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
|
||||
{
|
||||
int prompt_count = multiprompt_task.data.at("prompt").size();
|
||||
assert(prompt_count > 1);
|
||||
if (prompt_count <= 1) {
|
||||
send_error(multiprompt_task, "error while handling multiple prompts");
|
||||
return;
|
||||
}
|
||||
|
||||
// generate all the IDs for the subtasks
|
||||
std::vector<int> subtask_ids(prompt_count);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue