Merge branch 'master' into compilade/refactor-kv-cache
This commit is contained in:
commit
9b38f8bf65
102 changed files with 4745 additions and 1675 deletions
|
@ -93,14 +93,34 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// create a llama_batch
|
||||
// we use this object to submit token data for decoding
|
||||
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
|
||||
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
|
||||
|
||||
std::vector<llama_seq_id> seq_ids(n_parallel, 0);
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
seq_ids[i] = i;
|
||||
}
|
||||
|
||||
// evaluate the initial prompt
|
||||
for (size_t i = 0; i < tokens_list.size(); ++i) {
|
||||
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
|
||||
llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
|
||||
}
|
||||
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
if (llama_encode(ctx, batch)) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == -1) {
|
||||
decoder_start_token_id = llama_token_bos(model);
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
|
@ -109,11 +129,11 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// assign the system KV cache to all parallel sequences
|
||||
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
|
||||
for (int32_t i = 1; i < n_parallel; ++i) {
|
||||
llama_past_seq_cp(ctx, 0, i, -1, -1);
|
||||
}
|
||||
//// assign the system KV cache to all parallel sequences
|
||||
//// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
|
||||
//for (int32_t i = 1; i < n_parallel; ++i) {
|
||||
// llama_past_seq_cp(ctx, 0, i, -1, -1);
|
||||
//}
|
||||
|
||||
if (n_parallel > 1) {
|
||||
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
|
||||
|
|
|
@ -58,4 +58,3 @@ The above command will output space-separated float values.
|
|||
```powershell
|
||||
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
|
||||
```
|
||||
|
||||
|
|
|
@ -659,4 +659,3 @@ int main(int argc, char ** argv) {
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
-r ../../requirements/requirements-convert-legacy-llama.txt
|
||||
pillow~=10.2.0
|
||||
torch~=2.1.1
|
||||
torch~=2.2.1
|
||||
|
|
|
@ -10,4 +10,3 @@ More info:
|
|||
|
||||
https://github.com/ggerganov/llama.cpp/pull/4484
|
||||
https://github.com/ggerganov/llama.cpp/issues/4226
|
||||
|
||||
|
|
1
examples/main-cmake-pkg/.gitignore
vendored
1
examples/main-cmake-pkg/.gitignore
vendored
|
@ -48,4 +48,3 @@
|
|||
build*/
|
||||
out/
|
||||
tmp/
|
||||
|
||||
|
|
|
@ -30,4 +30,3 @@ target_include_directories(${TARGET} PRIVATE ${_common_path})
|
|||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
|
|
|
@ -255,7 +255,9 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
if (!llama_model_has_encoder(model)) {
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
}
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
@ -520,6 +522,24 @@ int main(int argc, char ** argv) {
|
|||
exit(1);
|
||||
}
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
int enc_input_size = embd_inp.size();
|
||||
llama_token * enc_input_buf = embd_inp.data();
|
||||
|
||||
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == -1) {
|
||||
decoder_start_token_id = llama_token_bos(model);
|
||||
}
|
||||
|
||||
embd_inp.clear();
|
||||
embd_inp.push_back(decoder_start_token_id);
|
||||
}
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
|
|
|
@ -1991,6 +1991,12 @@ int main(int argc, char ** argv) {
|
|||
params.n_batch = std::min(params.n_batch, n_kv);
|
||||
} else {
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
if (params.kl_divergence) {
|
||||
params.n_parallel = 1;
|
||||
} else {
|
||||
// ensure there's at least enough seq_ids for HellaSwag
|
||||
params.n_parallel = std::max(4, params.n_parallel);
|
||||
}
|
||||
}
|
||||
|
||||
if (params.ppl_stride > 0) {
|
||||
|
@ -2015,9 +2021,6 @@ int main(int argc, char ** argv) {
|
|||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
// ensure there's at least enough seq_ids for HellaSwag
|
||||
params.n_parallel = std::max(4, params.n_parallel);
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == NULL) {
|
||||
|
|
|
@ -31,4 +31,3 @@ for i in range(n-1):
|
|||
embedding2 = np.array(result[j])
|
||||
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
|
||||
print(f"Similarity between {i} and {j}: {similarity:.2f}")
|
||||
|
||||
|
|
|
@ -52,4 +52,3 @@ Feature: Passkey / Self-extend with context shift
|
|||
#| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
|
||||
#| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0
|
||||
# 987 |
|
||||
|
||||
|
|
|
@ -1054,4 +1054,3 @@
|
|||
</body>
|
||||
|
||||
</html>
|
||||
|
||||
|
|
|
@ -1058,4 +1058,3 @@
|
|||
</body>
|
||||
|
||||
</html>
|
||||
|
||||
|
|
|
@ -34,4 +34,3 @@ fi
|
|||
|
||||
#use multiple GPUs with same max compute units
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
|
||||
|
|
|
@ -31,4 +31,3 @@ exit /B 0
|
|||
:ERROR
|
||||
echo comomand error: %errorlevel%
|
||||
exit /B %errorlevel%
|
||||
|
||||
|
|
|
@ -7,5 +7,3 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
|||
|
||||
|
||||
.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
|
||||
|
||||
|
||||
|
|
|
@ -30,6 +30,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
|
|||
fprintf(stream, " --stdin read prompt from standard input.\n");
|
||||
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||
fprintf(stream, " --show-count print the total number of tokens.\n");
|
||||
}
|
||||
|
||||
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
||||
|
@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
|
|||
bool printing_ids = false;
|
||||
bool no_bos = false;
|
||||
bool disable_logging = false;
|
||||
bool show_token_count = false;
|
||||
const char * model_path = NULL;
|
||||
const char * prompt_path = NULL;
|
||||
const char * prompt_arg = NULL;
|
||||
|
@ -249,6 +251,9 @@ int main(int raw_argc, char ** raw_argv) {
|
|||
else if (arg == "--log-disable") {
|
||||
disable_logging = true;
|
||||
}
|
||||
else if (arg == "--show-count") {
|
||||
show_token_count = true;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
|
||||
return 1;
|
||||
|
@ -384,6 +389,9 @@ int main(int raw_argc, char ** raw_argv) {
|
|||
printf("]\n");
|
||||
}
|
||||
|
||||
if (show_token_count) {
|
||||
printf("Total number of tokens: %ld\n", tokens.size());
|
||||
}
|
||||
// silence valgrind
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue