Merge branch 'master' into compilade/lazy-convert-hf
This commit is contained in:
commit
bffdaf4010
43 changed files with 1646 additions and 179 deletions
|
@ -62,6 +62,18 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
|
|||
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
|
||||
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
|
||||
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
|
||||
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
|
||||
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
|
||||
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
|
||||
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
|
||||
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
|
||||
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
|
||||
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
|
||||
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
|
||||
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
|
||||
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
|
||||
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
|
||||
|
||||
**If compiled with `LLAMA_SERVER_SSL=ON`**
|
||||
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
|
||||
|
@ -260,7 +272,7 @@ node index.js
|
|||
|
||||
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
|
||||
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0`
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
|
||||
|
||||
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
|
||||
|
||||
|
@ -319,7 +331,7 @@ Notice that each `probs` is an array of length `n_probs`.
|
|||
|
||||
`content`: Set the text to tokenize.
|
||||
|
||||
Note that a special `BOS` token is never inserted.
|
||||
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||
|
||||
- **POST** `/detokenize`: Convert tokens to text.
|
||||
|
||||
|
|
|
@ -2266,17 +2266,31 @@ struct server_context {
|
|||
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
||||
result.tok = id;
|
||||
|
||||
const int32_t n_probs = slot.sparams.n_probs;
|
||||
if (slot.sparams.temp <= 0 && n_probs > 0) {
|
||||
// for llama_sample_token_greedy we need to sort candidates
|
||||
llama_sample_softmax(ctx, &cur_p);
|
||||
}
|
||||
const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
|
||||
if (n_probs > 0) {
|
||||
const size_t n_considered = slot.ctx_sampling->n_considered;
|
||||
|
||||
for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
cur_p.data[i].p
|
||||
});
|
||||
// Make sure at least n_probs top tokens are at the front of the vector:
|
||||
if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
|
||||
llama_sample_top_k(ctx, &cur_p, n_probs, 0);
|
||||
}
|
||||
|
||||
if (slot.sparams.temp == 0.0f) {
|
||||
// With greedy sampling the probabilities have possibly not been calculated.
|
||||
for (size_t i = 0; i < n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
i == 0 ? 1.0f : 0.0f
|
||||
});
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
|
@ -3633,7 +3647,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
std::vector<llama_token> tokens;
|
||||
if (body.count("content") != 0) {
|
||||
tokens = ctx_server.tokenize(body["content"], false);
|
||||
const bool add_special = json_value(body, "add_special", false);
|
||||
tokens = ctx_server.tokenize(body["content"], add_special);
|
||||
}
|
||||
const json data = format_tokenizer_response(tokens);
|
||||
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
||||
|
|
|
@ -7,6 +7,7 @@ Feature: llama.cpp server
|
|||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||
And a model file test-model.gguf
|
||||
And a model alias tinyllama-2
|
||||
And BOS token is 1
|
||||
And 42 as server seed
|
||||
# KV Cache corresponds to the total amount of tokens
|
||||
# that can be stored across all independent sequences: #4130
|
||||
|
@ -91,7 +92,18 @@ Feature: llama.cpp server
|
|||
"""
|
||||
What is the capital of France ?
|
||||
"""
|
||||
Then tokens can be detokenize
|
||||
Then tokens can be detokenized
|
||||
And tokens do not begin with BOS
|
||||
|
||||
Scenario: Tokenize w/ BOS
|
||||
Given adding special tokens
|
||||
When tokenizing:
|
||||
"""
|
||||
What is the capital of Germany?
|
||||
"""
|
||||
Then tokens begin with BOS
|
||||
Given first token is removed
|
||||
Then tokens can be detokenized
|
||||
|
||||
Scenario: Models available
|
||||
Given available models
|
||||
|
|
|
@ -376,6 +376,11 @@ def step_seed(context, seed):
|
|||
context.seed.append(seed)
|
||||
|
||||
|
||||
@step('BOS token is {bos:d}')
|
||||
def step_bos_token(context, bos):
|
||||
context.bos = bos
|
||||
|
||||
|
||||
@step('a prefix prompt')
|
||||
def step_prompt_prefix(context):
|
||||
context.prompt_prefix = context_text(context)
|
||||
|
@ -656,21 +661,29 @@ async def all_embeddings_are_generated(context):
|
|||
assert_embeddings(context.tasks_result.pop().pop())
|
||||
|
||||
|
||||
@step('adding special tokens')
|
||||
def step_tokenize_set_add_special(context):
|
||||
context.tokenize_add_special = True
|
||||
|
||||
|
||||
@step('tokenizing')
|
||||
@async_run_until_complete
|
||||
async def step_tokenize(context):
|
||||
context.tokenized_text = context_text(context)
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tokenize_args = {
|
||||
"content": context.tokenized_text,
|
||||
}
|
||||
if getattr(context, 'tokenize_add_special', None) is not None:
|
||||
tokenize_args['add_special'] = context.tokenize_add_special
|
||||
async with session.post(f'{context.base_url}/tokenize',
|
||||
json={
|
||||
"content": context.tokenized_text,
|
||||
}) as response:
|
||||
json=tokenize_args) as response:
|
||||
assert response.status == 200
|
||||
tokenize_json = await response.json()
|
||||
context.tokens = tokenize_json['tokens']
|
||||
|
||||
|
||||
@step('tokens can be detokenize')
|
||||
@step('tokens can be detokenized')
|
||||
@async_run_until_complete
|
||||
async def step_detokenize(context):
|
||||
assert len(context.tokens) > 0
|
||||
|
@ -685,6 +698,21 @@ async def step_detokenize(context):
|
|||
assert context.tokenized_text == detokenize_json['content'].strip()
|
||||
|
||||
|
||||
@step('tokens begin with BOS')
|
||||
def step_strings_for_tokenization(context):
|
||||
assert context.tokens[0] == context.bos
|
||||
|
||||
|
||||
@step('tokens do not begin with BOS')
|
||||
def step_strings_for_tokenization(context):
|
||||
assert context.tokens[0] != context.bos
|
||||
|
||||
|
||||
@step('first token is removed')
|
||||
def step_strings_for_tokenization(context):
|
||||
context.tokens = context.tokens[1:]
|
||||
|
||||
|
||||
@step('an OPTIONS request is sent from {origin}')
|
||||
@async_run_until_complete
|
||||
async def step_options_request(context, origin):
|
||||
|
|
|
@ -49,18 +49,18 @@ extern bool server_log_json;
|
|||
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
|
||||
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
|
||||
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
|
||||
|
||||
template <typename T>
|
||||
static T json_value(const json &body, const std::string &key, const T &default_value) {
|
||||
static T json_value(const json & body, const std::string & key, const T & default_value) {
|
||||
// Fallback null to default value
|
||||
if (body.contains(key) && !body.at(key).is_null()){
|
||||
if (body.contains(key) && !body.at(key).is_null()) {
|
||||
try {
|
||||
return body.value(key, default_value);
|
||||
}
|
||||
catch (nlohmann::json_abi_v3_11_3::detail::type_error const&){
|
||||
std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
|
||||
server_log("WARN", __func__, __LINE__, message.c_str(), body);
|
||||
return body.at(key);
|
||||
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
|
||||
std::stringstream ss;
|
||||
ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
|
||||
LOG_WARNING(ss.str().c_str(), body);
|
||||
return default_value;
|
||||
}
|
||||
} else {
|
||||
|
@ -68,16 +68,16 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
|||
}
|
||||
}
|
||||
|
||||
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
|
||||
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
|
||||
std::stringstream ss_tid;
|
||||
ss_tid << std::this_thread::get_id();
|
||||
json log = nlohmann::ordered_json{
|
||||
json log = json{
|
||||
{"tid", ss_tid.str()},
|
||||
{"timestamp", time(nullptr)},
|
||||
};
|
||||
|
||||
if (server_log_json) {
|
||||
log.merge_patch( {
|
||||
log.merge_patch({
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
|
@ -98,7 +98,7 @@ static inline void server_log(const char *level, const char *function, int line,
|
|||
}
|
||||
std::stringstream ss;
|
||||
ss << buf << " |";
|
||||
for (const auto& el : log.items())
|
||||
for (const auto & el : log.items())
|
||||
{
|
||||
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
ss << " " << el.key() << "=" << value;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue