Merge 71c98cc3bd into ee52225067

commit a31473c023
15 changed files with 352 additions and 110 deletions

Makefile
@@ -824,7 +824,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o ngram-cache.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -124,6 +124,7 @@ pub fn build(b: *std.build.Builder) !void {
 const console = make.obj("console", "common/console.cpp");
 const sampling = make.obj("sampling", "common/sampling.cpp");
 const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
+const ngram_cache = make.obj("ngram-cache", "common/ngram-cache.cpp");
 const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
 const train = make.obj("train", "common/train.cpp");
 const clip = make.obj("clip", "examples/llava/clip.cpp");

@@ -136,7 +137,7 @@ pub fn build(b: *std.build.Builder) !void {
 _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
 _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });

-const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
+const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, ngram_cache, clip, llava });
 if (server.target.isWindows()) {
 server.linkSystemLibrary("ws2_32");
 }
@@ -1594,9 +1594,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" -ld LOGDIR, --logdir LOGDIR\n");
 printf(" path under which to save YAML logs (no logging if unset)\n");
 printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
-printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
+printf(" path to static lookup cache to use for n-gram lookup decoding (not updated by generation)\n");
 printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
-printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
+printf(" path to dynamic lookup cache to use for n-gram lookup decoding (updated by generation)\n");
 printf(" --override-kv KEY=TYPE:VALUE\n");
 printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
 printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
@@ -6,19 +6,18 @@
 #include <fstream>

 void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
-std::vector<llama_token> & inp, int nnew, bool print_progress) {
+llama_token * inp_data, int inp_size, int nnew, bool print_progress) {
 const int64_t t_start_ms = ggml_time_ms();
-const int64_t inp_size = inp.size();

 const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1);
 int64_t n_done = 0;

 for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
-const int64_t i_start = std::max(inp_size - nnew, ngram_size);
+const int64_t i_start = std::max((int64_t)(inp_size - nnew), ngram_size);
 for (int64_t i = i_start; i < inp_size; ++i) {
 const int64_t ngram_start = i - ngram_size;
-llama_ngram ngram(&inp[ngram_start], ngram_size);
-const llama_token token = inp[i];
+llama_ngram ngram(inp_data + ngram_start, ngram_size);
+const llama_token token = inp_data[i];

 llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
 if (part_it == ngram_cache.end()) {
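The update function above now takes a raw token pointer plus an explicit length instead of a `std::vector` reference; `nnew` still counts how many trailing tokens are new. A minimal calling sketch, with hypothetical helper names, assuming only the declarations from `common/ngram-cache.h` as changed in this diff:

```cpp
#include <vector>
#include "ngram-cache.h"

// Build a context cache from every token seen so far, then extend it as
// single tokens are accepted (nnew == 1 keeps the update incremental).
static void rebuild_cache(llama_ngram_cache & nc, std::vector<llama_token> & tokens) {
    llama_ngram_cache_update(nc, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
        tokens.data(), tokens.size(), tokens.size(), /*print_progress =*/ false);
}

static void append_token(llama_ngram_cache & nc, std::vector<llama_token> & tokens, llama_token tok) {
    tokens.push_back(tok);
    llama_ngram_cache_update(nc, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
        tokens.data(), tokens.size(), /*nnew =*/ 1, /*print_progress =*/ false);
}
```

This mirrors how the `lookup` examples later in this diff pass `inp.data(), inp.size(), 1, false` after each accepted token.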
@@ -48,8 +47,8 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
 }

 // Helper function to get a token from the combined, speculative sequence of inp and draft.
-static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
-return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
+static llama_token get_token(const llama_token * inp_data, const int inp_size, const std::vector<llama_token> & draft, const int i) {
+return i < inp_size ? inp_data[i] : draft[1 + i - inp_size];
 }

 // If sample size or percentage are below these thresholds the draft is aborted early:

@@ -140,11 +139,10 @@ static llama_token try_draft(
 }

 void llama_ngram_cache_draft(
-std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
+llama_token * inp_data, int inp_size, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
 llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
 ) {
 GGML_ASSERT(draft.size() == 1);
-const int inp_size = inp.size();

 if (inp_size < LLAMA_NGRAM_STATIC) {
 return;

@@ -156,7 +154,7 @@ void llama_ngram_cache_draft(
 const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
 llama_ngram ngram_static;
 for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
-ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
+ngram_static.tokens[j-ngram_start_static] = get_token(inp_data, inp_size, draft, j);
 }
 llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
 llama_ngram_cache_part part_static;

@@ -170,7 +168,7 @@ void llama_ngram_cache_draft(
 const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
 llama_ngram ngram_cd;
 for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
-ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
+ngram_cd.tokens[j-ngram_start_cd] = get_token(inp_data, inp_size, draft, j);
 }
 ngrams_cd.push_back(ngram_cd);
 }

@@ -216,12 +214,11 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen

 }

-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename) {
 std::ifstream hashmap_file(filename, std::ios::binary);
 if (!hashmap_file) {
-throw std::ifstream::failure("Unable to open file " + filename);
+return false;
 }
-llama_ngram_cache ngram_cache;

 llama_ngram ngram;
 int32_t ntokens;

@@ -251,7 +248,7 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
 }
 GGML_ASSERT(hashmap_file.eof());

-return ngram_cache;
+return true;
 }

 void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
@@ -37,12 +37,21 @@ struct llama_ngram {
 }
 };

+struct llama_token_hash_function {
+size_t operator()(const llama_token token) const {
+// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
+return token * 11400714819323198485llu;
+}
+};
+
 struct llama_ngram_hash_function {
 size_t operator()(const llama_ngram & ngram) const {
-size_t hash = 0;
-for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
+size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
+hash ^= llama_token_hash_function{}(ngram.tokens[i]);
 }

 return hash;
 }
 };
@@ -64,7 +73,7 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
 // In order to get correct results inp_data can ONLY BE APPENDED TO.
 // Changes in the middle need a complete rebuild.
 void llama_ngram_cache_update(
-llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, llama_token * inp_data, int inp_size, int nnew, bool print_progress);

 // Try to draft tokens from ngram caches.
 // inp: the tokens generated so far.

@@ -75,7 +84,7 @@ void llama_ngram_cache_update(
 // nc_dynamic: ngram cache based on previous user generations.
 // nc_static: ngram cache generated from a large text corpus, used for validation.
 void llama_ngram_cache_draft(
-std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
+llama_token * inp_data, int inp_size, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
 llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);

 // Save an ngram cache to a file.

@@ -84,9 +93,10 @@ void llama_ngram_cache_draft(
 void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);

 // Load an ngram cache saved with llama_ngram_cache_save.
+// ngram_cache: the ngram cache to load the data into.
 // filename: the path from which to load the ngram cache.
 // returns: an ngram cache containing the information saved to filename.
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename);

 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
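Because `llama_ngram_cache_load` now reports failure through its return value instead of throwing, callers own the cache object and decide how to react. A small round-trip sketch against the declarations above (the helper name and file path are illustrative, not part of this change):

```cpp
#include <string>
#include "ngram-cache.h"

// Merge a freshly collected context cache into a dynamic cache kept on disk.
static void persist_dynamic_cache(llama_ngram_cache & nc_new, std::string path) {
    llama_ngram_cache nc_dynamic;
    if (!llama_ngram_cache_load(nc_dynamic, path)) {
        // no cache on disk yet: start from an empty one, the file is created below
    }
    llama_ngram_cache_merge(nc_dynamic, nc_new);
    llama_ngram_cache_save(nc_dynamic, path);
}
```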
@@ -1,13 +1,82 @@
 # llama.cpp/examples/lookup

-Demonstration of Prompt Lookup Decoding
+Demonstration of speculative decoding using n-gram lookup.
+Initial version was based on https://github.com/apoorvumang/prompt-lookup-decoding .
+The current version uses three separate types of "n-gram caches".
+Each of these caches maps how frequently a given n-gram is followed by a specific token.
+The difference between the caches lies in what data is used to build them:

-https://github.com/apoorvumang/prompt-lookup-decoding
+* The "context" cache is built using the tokens in the current context of a user generation.
+* The "dynamic" cache is built by merging the context caches of previous user generations.
+* The "static" cache is built from a large text corpus with no relation to the current context.

-The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft`. The first two determine the size of the ngrams to search for in the prompt for a match. The latter specifies how many subsequent tokens to draft if a match is found.
+The tradeoff between these caches lies in relevance to the current context vs. the amount of input data.
+When trying to draft a new token using n-gram lookup the algorithm is as follows:

-More info:
+* Try to draft a suitable token from the context cache. If a static cache is available, use it to validate the draft candidates. This is done by simply multiplying the frequencies of the two caches.
+* Try to draft a suitable token from the dynamic cache, validate with static cache if available.
+* Try to draft a suitable token from the static cache.

-https://github.com/ggerganov/llama.cpp/pull/4484
-https://github.com/ggerganov/llama.cpp/issues/4226
+Only a single token sequence with the most likely token candidates is drafted.
+All tokens must pass thresholds for frequency and sample size in order to be drafted.
+
+Relevant command line arguments:
+
+- `--draft`: maximum number of additional tokens to draft using n-gram lookup. Default: 5. Set to 0 to disable n-gram lookup. **Results are not deterministic with n-gram lookup enabled due to varying batch size.**
+- `-lcs FNAME, --lookup-cache-static FNAME`: optional path to static lookup cache to use for n-gram lookup. Created from a large, unspecific text corpus using `lookup-create`.
+- `-lcd FNAME, --lookup-cache-dynamic FNAME`: optional path to dynamic lookup cache to use for n-gram lookup. Contains data from previous generations. Automatically created and filled while the server is running but by default discarded on server exit. Setting this argument tries to initialize the dynamic cache from a file and saves it to said file on server shutdown.
+
+N-gram lookup caches saved to disk are compatible between models as long as they use the same tokenizer
+(but for dynamic caches the resulting drafted tokens may be wrong which means there is no speedup).
+Furthermore, the data format for both types of caches is the same so they can be used interchangeably (but probably not with good results).
+
+## Usage Examples
+
+### `lookup`
+
+Generation using n-gram lookup:
+
+``` sh
+./lookup --model models/opt/llama_2-7b-q4_0.gguf -ngl 99 --n-predict 256 --ignore-eos --draft 3 --color --prompt "Write a love story about two stars that tragically ends in a type Ia supernova. Use a lot of emotional and dramatic language."
+```
+
+The `--color` flag highlights the successfully predicted tokens.
+The `--lookup-cache-static` and `--lookup-cache-dynamic` arguments can be set to provide static/dynamic caches.
+
+### `lookup-stats`
+
+Determine n-gram lookup effectiveness for a given text corpus (similar to `perplexity`):
+
+``` sh
+./lookup-stats --model /opt/models/llama_2-7b-q4_0.gguf --file wikitext-2-raw/wiki.test.raw --draft 3
+```
+
+The `--lookup-cache-static` and `--lookup-cache-dynamic` arguments can be set to provide static/dynamic caches.
+
+### `lookup-create`
+
+Create a static lookup cache from a text corpus:
+
+``` sh
+./lookup-create --model /opt/models/llama_2-7b-q4_0.gguf --lookup-cache-static wt103-llama_2.lcs --file wikitext-103-raw/wiki.train.raw
+```
+
+The `--lookup-cache-static` argument must be set to provide the path to which the static lookup cache will be saved.
+The tokenizer for which to create the cache is taken from the provided model.
+
+### `lookup-merge`
+
+Merge two lookup caches into one:
+
+``` sh
+./lookup-merge cache_1.lcs cache_2.lcs cache_merged.lcs
+```
+
+Can be used for both static and dynamic lookup caches.
+
+## More info:
+
+* https://github.com/ggerganov/llama.cpp/pull/4484
+* https://github.com/ggerganov/llama.cpp/issues/4226
+* https://github.com/ggerganov/llama.cpp/pull/5479
+* https://github.com/ggerganov/llama.cpp/pull/6828
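The drafting procedure described in the README maps directly onto the refactored API: seed the draft with the last accepted token, then let the context, dynamic and static caches propose continuations. A condensed sketch of how the example programs wire this together (the wrapper function and variable names are illustrative):

```cpp
#include <vector>
#include "ngram-cache.h"

// Propose up to n_draft speculative tokens for the sequence in `inp`.
static std::vector<llama_token> draft_tokens(
        std::vector<llama_token> & inp,   // tokens generated so far
        llama_ngram_cache & nc_context,   // built from the current context
        llama_ngram_cache & nc_dynamic,   // merged from previous generations
        llama_ngram_cache & nc_static,    // built from a large corpus
        int n_draft) {
    std::vector<llama_token> draft = { inp.back() };  // draft[0] must be the last accepted token
    llama_ngram_cache_draft(inp.data(), inp.size(), draft, n_draft,
        LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, nc_context, nc_dynamic, nc_static);
    return draft;  // draft[1..] are the speculative tokens, possibly empty
}
```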
@@ -34,7 +34,7 @@ int main(int argc, char ** argv){

 llama_ngram_cache ngram_cache;
-llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
+llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp.data(), inp.size(), inp.size(), true);
 fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());

 llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
@@ -33,11 +33,13 @@ int main(int argc, char ** argv){
 }

 fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
-llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
+llama_ngram_cache ngram_cache_merged;
+GGML_ASSERT(llama_ngram_cache_load(ngram_cache_merged, args[0]));

 for (size_t i = 1; i < args.size()-1; ++i) {
 fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
-llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
+llama_ngram_cache ngram_cache;
+GGML_ASSERT(llama_ngram_cache_load(ngram_cache, args[i]));

 llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
 }
@@ -30,7 +30,6 @@ int main(int argc, char ** argv){

 // load the model
 std::tie(model, ctx) = llama_init_from_gpt_params(params);
-GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

 // tokenize the prompt
 std::vector<llama_token> inp;

@@ -46,18 +45,15 @@ int main(int argc, char ** argv){
 const int64_t t_start_draft_us = ggml_time_us();

 if (!params.lookup_cache_static.empty()) {
-try {
-ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
-} catch (std::ifstream::failure const &) {
+if(!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {
 fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
 exit(1);
 }
 }

 if (!params.lookup_cache_dynamic.empty()) {
-try {
-ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
-} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+// If the dynamic lookup cache doesn't exist it will be created at the end of the program:
+llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);
 }

 t_draft_flat_us += ggml_time_us() - t_start_draft_us;

@@ -85,7 +81,9 @@ int main(int argc, char ** argv){

 {
 const int64_t t_start_draft_us = ggml_time_us();
-llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+llama_ngram_cache_draft(
+pseudo_output.data(), pseudo_output.size(), draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
+ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
 t_draft_us += ggml_time_us() - t_start_draft_us;
 }

@@ -104,7 +102,8 @@ int main(int argc, char ** argv){

 {
 const int64_t t_start_draft_us = ggml_time_us();
-llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+llama_ngram_cache_update(
+ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output.data(), pseudo_output.size(), 1, false);
 t_draft_us += ggml_time_us() - t_start_draft_us;
 }
 }

@@ -114,7 +113,8 @@ int main(int argc, char ** argv){
 pseudo_output.push_back(inp_slice[pseudo_output.size()]);
 {
 const int64_t t_start_draft_us = ggml_time_us();
-llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+llama_ngram_cache_update(
+ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output.data(), pseudo_output.size(), 1, false);
 t_draft_us += ggml_time_us() - t_start_draft_us;
 }
 }
@@ -38,7 +38,6 @@ int main(int argc, char ** argv){

 // load the model
 std::tie(model, ctx) = llama_init_from_gpt_params(params);
-GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

 // tokenize the prompt
 std::vector<llama_token> inp;

@@ -53,21 +52,18 @@ int main(int argc, char ** argv){
 {
 // Fill up context ngram cache with tokens from user input:
 const int64_t t_start_draft_us = ggml_time_us();
-llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
+llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp.data(), inp.size(), inp.size(), false);

 if (!params.lookup_cache_static.empty()) {
-try {
-ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
-} catch (std::ifstream::failure const &) {
+if(!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {
 fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
 exit(1);
 }
 }

 if (!params.lookup_cache_dynamic.empty()) {
-try {
-ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
-} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+// If the dynamic lookup cache doesn't exist it will be created at the end of the program:
+llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);
 }

 t_draft_flat_us += ggml_time_us() - t_start_draft_us;

@@ -156,7 +152,7 @@ int main(int argc, char ** argv){
 {
 // Update context ngram cache with the newly accepted token:
 const int64_t t_start_draft_us = ggml_time_us();
-llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp.data(), inp.size(), 1, false);
 t_draft_us += ggml_time_us() - t_start_draft_us;
 }

@@ -182,7 +178,7 @@ int main(int argc, char ** argv){
 {
 // Update context ngram cache with the newly accepted token:
 const int64_t t_start_draft_us = ggml_time_us();
-llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp.data(), inp.size(), 1, false);
 t_draft_us += ggml_time_us() - t_start_draft_us;
 }
 break;

@@ -204,7 +200,8 @@ int main(int argc, char ** argv){
 GGML_ASSERT(draft[0] == inp.back());
 const int64_t t_start_draft_us = ggml_time_us();

-llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+llama_ngram_cache_draft(
+inp.data(), inp.size(), draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);

 for (size_t i = 1; i < draft.size(); ++i) {
 llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
@@ -8,6 +8,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 * LLM inference of F16 and quantum models on GPU and CPU
 * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
 * Parallel decoding with multi-user support
+* Speculative decoding based on n-gram lookup
 * Continuous batching
 * Multimodal (wip)
 * Monitoring endpoints

@@ -49,7 +50,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
 - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
 - `--embedding`: Enable embedding extraction. Default: disabled
-- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
+- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. **Values > 1 produce nondeterministic results depending on the number of active slots.**
 - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
 - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.

@@ -74,6 +75,9 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - `-fa`, `--flash-attn` : enable flash attention (default: disabled).
 - `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
 - `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
+- `--draft`: maximum number of additional tokens to draft using n-gram lookup. Default: 5. Set to 0 to disable n-gram lookup. **Results are not deterministic with n-gram lookup enabled due to varying batch size.**
+- `-lcs FNAME, --lookup-cache-static FNAME`: optional path to static lookup cache to use for n-gram lookup. Created from a large, unspecific text corpus using `lookup-create`.
+- `-lcd FNAME, --lookup-cache-dynamic FNAME`: optional path to dynamic lookup cache to use for n-gram lookup. Contains data from previous generations. Automatically created and filled while the server is running but by default discarded on server exit. Setting this argument tries to initialize the dynamic cache from a file and saves it to said file on server shutdown.

 **If compiled with `LLAMA_SERVER_SSL=ON`**
 - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
@@ -45,6 +45,9 @@ def main(args_in: list[str] | None = None) -> None:
 parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
 parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
 parser.add_argument("--duration", type=str, help="Bench scenario", required=True)
+parser.add_argument("--draft", type=int, help="Max. number of additional tokens to draft for lookup decoding", required=False, default=5)
+parser.add_argument("-lcs", "--lookup-cache-static", type=str, help="Path to optional static lookup cache to use.", required=False, default=None)
+parser.add_argument("-lcd", "--lookup-cache-dynamic", type=str, help="Path to optional dynamic lookup cache to use. Will be overwritten upon server shutdown.", required=False, default=None)

 args = parser.parse_args(args_in)

@@ -270,6 +273,11 @@ def start_server_background(args):
 server_args.append('--metrics')
 server_args.append('--flash-attn')
 server_args.extend(['--log-format', "text"])
+server_args.extend(['--draft', args.draft])
+if args.lookup_cache_static is not None:
+server_args.extend(['--lookup-cache-static', args.lookup_cache_static])
+if args.lookup_cache_dynamic is not None:
+server_args.extend(['--lookup-cache-dynamic', args.lookup_cache_dynamic])
 args = [str(arg) for arg in [server_path, *server_args]]
 print(f"bench: starting server with: {' '.join(args)}")
 pkwargs = {
@@ -2,8 +2,9 @@

 #include "common.h"
 #include "json-schema-to-grammar.h"
-#include "llama.h"
 #include "grammar-parser.h"
+#include "llama.h"
+#include "ngram-cache.h"

 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error

@@ -165,6 +166,10 @@ struct server_slot {
 // when a task is submitted, we first tokenize the prompt and store it here
 std::vector<llama_token> prompt_tokens;

+llama_ngram_cache nc_context;
+std::vector<llama_token> draft;
+std::vector<llama_token> context_tokens;
+
 std::string generated_text;
 std::vector<llama_token> cache_tokens;
 std::vector<completion_token_output> generated_token_probs;
@@ -220,6 +225,9 @@ struct server_slot {
 n_past_se = 0;

 generated_token_probs.clear();
+
+nc_context.clear();
+draft.clear();
 }

 bool has_budget(gpt_params &global_params) {

@@ -260,7 +268,7 @@ struct server_slot {
 }
 }

-json get_formated_timings() const {
+json get_formatted_timings() const {
 return json {
 {"prompt_n", n_prompt_tokens_processed},
 {"prompt_ms", t_prompt_processing},

@@ -425,7 +433,7 @@ struct server_queue {
 queue_tasks_deferred.push_back(std::move(task));
 }

-// Get the next id for creating anew task
+// Get the next id for creating a new task
 int get_new_id() {
 std::unique_lock<std::mutex> lock(mutex_tasks);
 int new_id = id++;

@@ -541,7 +549,7 @@ struct server_queue {
 queue_multitasks.push_back(multi);
 }

-// updatethe remaining subtasks, while appending results to multitask
+// update the remaining subtasks, while appending results to multitask
 void update_multitask(int id_multi, int id_sub, server_task_result & result) {
 std::lock_guard<std::mutex> lock(mutex_tasks);
 for (auto & multitask : queue_multitasks) {

@@ -574,7 +582,7 @@ struct server_response {
 waiting_task_ids.insert(id_task);
 }

-// when the request is finished, we can remove task associated with it
+// when the request is finished, we can remove the task associated with it
 void remove_waiting_task_id(int id_task) {
 LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}});
@@ -655,6 +663,10 @@ struct server_context {
 std::vector<server_slot> slots;
 json default_generation_settings_for_props;

+int32_t n_draft = 3;
+llama_ngram_cache nc_dynamic;
+llama_ngram_cache nc_static;
+
 server_queue queue_tasks;
 server_response queue_results;

@@ -715,6 +727,8 @@ struct server_context {
 slot.n_ctx = n_ctx_slot;
 slot.n_predict = params.n_predict;

+slot.context_tokens.resize(n_ctx_slot);
+
 LOG_INFO("new slot", {
 {"id_slot", slot.id},
 {"n_ctx_slot", slot.n_ctx}

@@ -745,7 +759,7 @@ struct server_context {
 slots.push_back(slot);
 }

-default_generation_settings_for_props = get_formated_generation(slots.front());
+default_generation_settings_for_props = get_formatted_generation(slots.front());
 default_generation_settings_for_props["seed"] = -1;

 // the update_slots() logic will always submit a maximum of n_batch tokens

@@ -1066,6 +1080,13 @@ struct server_context {
 for (int i = 0; i < (int)system_tokens.size(); ++i) {
 llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
 }
+if (n_draft > 0) {
+for (auto slot : slots) {
+memcpy(slot.context_tokens.data(), system_tokens.data(), system_tokens.size()*sizeof(llama_token));
+llama_ngram_cache_update(
+slot.nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, system_tokens.data(), system_tokens.size(), system_tokens.size(), false);
+}
+}

 const int32_t n_batch = llama_n_batch(ctx);
@@ -1243,7 +1264,7 @@ struct server_context {
 return slot.has_next_token; // continue
 }

-json get_formated_generation(const server_slot & slot) const {
+json get_formatted_generation(const server_slot & slot) const {
 const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
 const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);

@@ -1365,7 +1386,7 @@ struct server_context {
 {"model", params.model_alias},
 {"tokens_predicted", slot.n_decoded},
 {"tokens_evaluated", slot.n_prompt_tokens},
-{"generation_settings", get_formated_generation(slot)},
+{"generation_settings", get_formatted_generation(slot)},
 {"prompt", slot.prompt},
 {"truncated", slot.truncated},
 {"stopped_eos", slot.stopped_eos},

@@ -1373,7 +1394,7 @@ struct server_context {
 {"stopped_limit", slot.stopped_limit},
 {"stopping_word", slot.stopping_word},
 {"tokens_cached", slot.n_past},
-{"timings", slot.get_formated_timings()}
+{"timings", slot.get_formatted_timings()}
 };

 if (slot.sparams.n_probs > 0) {

@@ -1573,7 +1594,7 @@ struct server_context {
 int n_processing_slots = 0;

 for (server_slot & slot : slots) {
-json slot_data = get_formated_generation(slot);
+json slot_data = get_formatted_generation(slot);
 slot_data["id"] = slot.id;
 slot_data["id_task"] = slot.id_task;
 slot_data["state"] = slot.state;
@@ -1775,6 +1796,9 @@ struct server_context {
 if (slot.command == SLOT_COMMAND_RELEASE) {
 slot.state = SLOT_STATE_IDLE;
 slot.command = SLOT_COMMAND_NONE;
+if (n_draft > 0) {
+llama_ngram_cache_merge(nc_dynamic, slot.nc_context);
+}
 slot.t_last_used = ggml_time_us();

 LOG_INFO("slot released", {

@@ -1846,6 +1870,9 @@ struct server_context {

 llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
 llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+for (int j = n_keep; j < slot.n_past - n_discard; ++j) {
+slot.context_tokens[j] = slot.context_tokens[j + n_discard];
+}

 if (slot.params.cache_prompt) {
 for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {

@@ -1865,7 +1892,7 @@ struct server_context {
 // start populating the batch for this iteration
 llama_batch_clear(batch);

-// frist, add sampled tokens from any ongoing sequences
+// first, add sampled tokens from any ongoing sequences
 for (auto & slot : slots) {
 if (slot.state == SLOT_STATE_IDLE) {
 continue;
@@ -1878,6 +1905,12 @@ struct server_context {
 // TODO: we always have to take into account the "system_tokens"
 // this is not great and needs to be improved somehow
 llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+slot.context_tokens[system_tokens.size() + slot_npast] = slot.sampled;
+if (n_draft > 0) {
+llama_ngram_cache_update(
+slot.nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
+slot.context_tokens.data(), system_tokens.size() + slot_npast, 1, false);
+}

 slot.n_past += 1;

@@ -1885,6 +1918,25 @@ struct server_context {
 slot.cache_tokens.push_back(slot.sampled);
 }

+if (slot.infill || slot.embedding) {
+continue;
+}
+
+const int32_t max_draft = std::min(n_draft, slot.n_ctx - slot.n_past - 1);
+if (max_draft <= 0) {
+continue;
+}
+
+slot.draft.clear();
+slot.draft.push_back(slot.context_tokens[slot.n_past - 1]);
+llama_ngram_cache_draft(
+slot.context_tokens.data(), slot.n_past, slot.draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, slot.nc_context, nc_dynamic, nc_static);
+
+for (int j = 1; j < (int)slot.draft.size(); ++j) {
+llama_batch_add(batch, slot.draft[j], system_tokens.size() + slot.n_past, {slot.id + 1}, true);
+slot.n_past++;
+}
+
 LOG_VERBOSE("slot decode token", {
 {"id_slot", slot.id},
 {"id_task", slot.id_task},
@@ -1905,7 +1957,7 @@ struct server_context {
 for (auto & slot : slots) {
 // this slot still has a prompt to be processed
 if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
-auto & prompt_tokens = slot.prompt_tokens;
+std::vector<llama_token> & prompt_tokens = slot.prompt_tokens;

 // we haven't tokenized the prompt yet - do it now:
 if (prompt_tokens.empty()) {

@@ -2107,6 +2159,11 @@ struct server_context {
 }

 llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+slot.context_tokens[system_tokens.size() + slot_npast] = prompt_tokens[slot.n_past];
+if (n_draft > 0) {
+llama_ngram_cache_update(
+slot.nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, slot.context_tokens.data(), slot_npast, 1, false);
+}

 if (slot.params.cache_prompt) {
 slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2250,56 +2307,70 @@ struct server_context {
                     continue; // continue loop of slots
                 }

-                completion_token_output result;
-                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+                int j = 0;
+                do { // while (j < std::min(n_batch, (int32_t)slot.draft.size()) && slot.sampled == slot.draft[j])
+                    completion_token_output result;
+                    const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i + j);

                     llama_sampling_accept(slot.ctx_sampling, ctx, id, true);

                     slot.n_decoded += 1;
                     if (slot.n_decoded == 1) {
                         slot.t_start_generation = ggml_time_us();
                         slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
                         metrics.on_prompt_eval(slot);
                     }

                     llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
                     result.tok = id;

                     const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
                     if (n_probs > 0) {
                         const size_t n_valid = slot.ctx_sampling->n_valid;

                         // Make sure at least n_probs top tokens are at the front of the vector:
                         if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
                             llama_sample_top_k(ctx, &cur_p, n_probs, 0);
                         }

                         if (slot.sparams.temp == 0.0f) {
                             // With greedy sampling the probabilities have possibly not been calculated.
                             for (size_t i = 0; i < n_probs; ++i) {
                                 result.probs.push_back({
                                     cur_p.data[i].id,
                                     i == 0 ? 1.0f : 0.0f
                                 });
                             }
                         } else {
                             for (size_t i = 0; i < n_probs; ++i) {
                                 result.probs.push_back({
                                     cur_p.data[i].id,
                                     i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
                                 });
                             }
                         }
                     }

+                    ++j;
+
                     if (!process_token(result, slot)) {
+                        slot.n_past -= slot.draft.size() - j;
+                        llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1);
+                        slot.draft.clear();
+
                         slot.release();
                         slot.print_timings();
                         send_final_response(slot);
                         metrics.on_prediction(slot);
                     }
+                } while (j < std::min(n_batch, (int32_t)slot.draft.size()) && slot.sampled == slot.draft[j]);
+
+                if (j < (int)slot.draft.size()) {
+                    slot.n_past -= slot.draft.size() - j;
+                    llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1);
+                }

                 slot.i_batch = -1;
+                slot.draft.clear();
             }
         }
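The hunk above wraps the per-slot sampling step in a `do { ... } while` loop so that, when n-gram lookup drafting is active, several drafted tokens can be verified in one decode pass; on the first mismatch the rest of the draft is discarded and `slot.n_past` plus the corresponding KV-cache entries are rolled back. The following standalone sketch is an illustration of that acceptance rule only, not the server code (`verify_draft`, `token`, and the token values are invented for the example): keep the longest prefix of the draft that matches what the model actually samples, and roll back everything after it.

```cpp
// Minimal sketch of greedy draft verification (illustration only).
// Drafted tokens are accepted while they match what the model samples; everything
// after the first mismatch has to be discarded, which is why the server rolls back
// slot.n_past and removes the stale KV-cache entries with llama_kv_cache_seq_rm().
#include <cstdio>
#include <vector>

using token = int; // stand-in for llama_token

// Returns how many drafted tokens were accepted (length of the matching prefix).
static size_t verify_draft(const std::vector<token> & draft, const std::vector<token> & sampled) {
    size_t n_accepted = 0;
    while (n_accepted < draft.size() && n_accepted < sampled.size() &&
           sampled[n_accepted] == draft[n_accepted]) {
        ++n_accepted;
    }
    return n_accepted;
}

int main() {
    const std::vector<token> draft   = {11, 22, 33, 44}; // proposed by the n-gram cache
    const std::vector<token> sampled = {11, 22, 99, 44}; // what the model actually sampled
    const size_t n_accepted = verify_draft(draft, sampled);

    // Here 2 of 4 drafted tokens are accepted, so the context would be rolled back
    // by draft.size() - n_accepted = 2 positions before decoding continues.
    std::printf("accepted %zu of %zu drafted tokens\n", n_accepted, draft.size());
    return 0;
}
```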
@@ -2351,6 +2422,11 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  - distribute: spread execution evenly over all nodes\n");
     printf("  - isolate: only spawn threads on CPUs on the node that execution started on\n");
     printf("  - numactl: use the CPU map provided by numactl\n");
+    printf("  --draft N                 max. number of additional tokens to draft for n-gram lookup decoding (default: %d)\n", params.n_draft);
+    printf("  -lcs FNAME, --lookup-cache-static FNAME\n");
+    printf("                            path to static lookup cache to use for n-gram lookup decoding (not updated by generation)\n");
+    printf("  -lcd FNAME, --lookup-cache-dynamic FNAME\n");
+    printf("                            path to dynamic lookup cache to use for n-gram lookup decoding (updated by generation)\n");
     if (llama_supports_gpu_offload()) {
         printf("  -ngl N, --n-gpu-layers N\n");
         printf("                            number of layers to store in VRAM\n");
@@ -2753,6 +2829,24 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
                 else { invalid_param = true; break; }
             }
+        } else if (arg == "-lcs" || arg == "--lookup-cache-static") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lookup_cache_static = argv[i];
+        } else if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lookup_cache_dynamic = argv[i];
+        } else if (arg == "--draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_draft = std::stoi(argv[i]);
         } else if (arg == "--embedding" || arg == "--embeddings") {
             params.embedding = true;
         } else if (arg == "-cb" || arg == "--cont-batching") {
@@ -3027,6 +3121,23 @@ int main(int argc, char ** argv) {

     LOG_INFO("model loaded", {});

+    ctx_server.n_draft = params.n_draft;
+
+    if (!params.lookup_cache_static.empty()) {
+        LOG_INFO("Loading static lookup cache from %s", {params.lookup_cache_static.c_str()});
+        if (!llama_ngram_cache_load(ctx_server.nc_static, params.lookup_cache_static)) {
+            LOG_ERROR("Did not find a lookup cache under %s", {params.lookup_cache_static.c_str()});
+            return 1;
+        }
+    }
+
+    if (!params.lookup_cache_dynamic.empty()) {
+        LOG_INFO("Loading dynamic lookup cache from %s", {params.lookup_cache_dynamic.c_str()});
+        if (!llama_ngram_cache_load(ctx_server.nc_dynamic, params.lookup_cache_dynamic)) {
+            LOG_INFO("Did not find a lookup cache under %s . It will be created on server shutdown.", {params.lookup_cache_dynamic.c_str()});
+        }
+    }
+
     const auto model_meta = ctx_server.model_meta();

     // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
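For context on what `nc_static` and `nc_dynamic` hold: an n-gram lookup cache maps a short suffix of the recent token stream to continuations that have followed it before, so the server can draft likely next tokens without a separate draft model. The toy below is only a sketch of that idea; it is not `common/ngram-cache.*`, `toy_ngram_cache` is an invented name, and the real caches track several n-gram sizes and are loaded/saved with `llama_ngram_cache_load`/`llama_ngram_cache_save` as in the hunks above and below.

```cpp
// Toy n-gram lookup cache (illustration only, not the llama.cpp implementation):
// count which token followed each n-token suffix, then draft by repeatedly taking
// the most frequent continuation of the current suffix.
#include <cstdio>
#include <map>
#include <unordered_map>
#include <vector>

using token = int; // stand-in for llama_token

struct toy_ngram_cache {
    // key: the last n tokens, value: continuation token -> observed count
    std::map<std::vector<token>, std::unordered_map<token, int>> counts;
    size_t n = 2; // single n-gram size for simplicity

    void update(const std::vector<token> & text) {
        for (size_t i = 0; i + n < text.size(); ++i) {
            std::vector<token> key(text.begin() + i, text.begin() + i + n);
            counts[key][text[i + n]]++;
        }
    }

    // Draft up to n_draft tokens by greedily extending the context.
    std::vector<token> draft(std::vector<token> ctx, int n_draft) const {
        std::vector<token> out;
        while ((int) out.size() < n_draft && ctx.size() >= n) {
            std::vector<token> key(ctx.end() - n, ctx.end());
            auto it = counts.find(key);
            if (it == counts.end()) break;
            token best = -1; int best_count = 0;
            for (const auto & kv : it->second) {
                if (kv.second > best_count) { best = kv.first; best_count = kv.second; }
            }
            if (best_count == 0) break;
            out.push_back(best);
            ctx.push_back(best);
        }
        return out;
    }
};

int main() {
    toy_ngram_cache cache;
    cache.update({1, 2, 3, 1, 2, 3, 1, 2, 4});
    for (token t : cache.draft({5, 1, 2}, 3)) std::printf("%d ", t); // drafts 3 1 2
    std::printf("\n");
    return 0;
}
```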
@@ -3827,6 +3938,11 @@ int main(int argc, char ** argv) {
     svr->stop();
     t.join();

+    if (!params.lookup_cache_dynamic.empty()) {
+        LOG_INFO("Saving dynamic lookup cache to %s", {params.lookup_cache_dynamic.c_str()});
+        llama_ngram_cache_save(ctx_server.nc_dynamic, params.lookup_cache_dynamic);
+    }
+
     llama_backend_free();

     return 0;
@@ -51,7 +51,6 @@ Feature: Results
   Scenario Outline: consistent results with same seed and varying batch size
     Given 4 slots
     And <temp> temperature
-    # And 0 as draft
     Then the server is starting
     Then the server is healthy

@@ -79,3 +78,32 @@ Feature: Results
       # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
       # | 2       | 1.0  |
       # | 4       | 1.0  |
+
+  Scenario Outline: consistent results with same seed and varying n_draft
+    Given 0.0 temperature
+    Given <n_slots> slots
+    Given 0 as draft
+    Then the server is starting
+    Then the server is healthy
+
+    Given 4 prompts "Write a very long story about AI." with seed 42
+    And concurrent completion requests
+    Then the server is busy
+    Then the server is idle
+    And all slots are idle
+
+    Given 3 as draft
+    Then the server is starting
+    Then the server is healthy
+
+    Given 4 prompts "Write a very long story about AI." with seed 42
+    And concurrent completion requests
+    Then the server is busy
+    Then the server is idle
+    And all slots are idle
+
+    Then all predictions are equal
+
+    Examples:
+      | n_slots |
+      | 1       |
+      | 2       |
@@ -3,6 +3,7 @@ import collections
 import json
 import os
 import re
+import signal
 import socket
 import subprocess
 import sys
@@ -873,6 +874,8 @@ async def request_completion(prompt,
                                 headers=headers,
                                 timeout=3600) as response:
             if expect_api_error is None or not expect_api_error:
+                if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON' and response.status != 200:
+                    print(f"Unexpected bad HTTP response: {response.status}")
                 assert response.status == 200
                 assert response.headers['Access-Control-Allow-Origin'] == origin
                 return await response.json()
@@ -1263,8 +1266,7 @@ def start_server_background(context):
         server_args.extend(['--ubatch-size', context.n_ubatch])
     if context.n_gpu_layer:
         server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
-    if context.draft is not None:
-        server_args.extend(['--draft', context.draft])
+    server_args.extend(['--draft', context.draft if context.draft is not None else 0])
     if context.server_continuous_batching:
         server_args.append('--cont-batching')
     if context.server_embeddings:
@@ -1306,6 +1308,14 @@ def start_server_background(context):
         'stdout': subprocess.PIPE,
         'stderr': subprocess.PIPE
     }
+
+    if context.server_process is not None:
+        if os.name == 'nt':
+            interrupt = signal.CTRL_C_EVENT
+        else:
+            interrupt = signal.SIGINT
+        context.server_process.send_signal(interrupt)
+
     context.server_process = subprocess.Popen(
         [str(arg) for arg in [context.server_path, *server_args]],
         **pkwargs)