Merge commit 'e16b9fa4ba
' into nomic-vulkan
This commit is contained in:
commit
fe26e6adff
48 changed files with 6286 additions and 5841 deletions
|
@ -154,6 +154,10 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
|
||||
LOG_TEE("\n");
|
||||
|
||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||
|
||||
|
@ -181,7 +185,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
|
|
|
@ -11,7 +11,7 @@ int main(int argc, char ** argv) {
|
|||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]);
|
||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,9 @@ int main(int argc, char ** argv) {
|
|||
// total length of the sequences including the prompt
|
||||
int n_len = 32;
|
||||
|
||||
// number of layers to offload to the GPU
|
||||
int n_gpu_layers = 0;
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
|
@ -37,6 +40,10 @@ int main(int argc, char ** argv) {
|
|||
n_len = std::atoi(argv[4]);
|
||||
}
|
||||
|
||||
if (argc >= 6) {
|
||||
n_gpu_layers = std::atoi(argv[5]);
|
||||
}
|
||||
|
||||
if (params.prompt.empty()) {
|
||||
params.prompt = "Hello my name is";
|
||||
}
|
||||
|
@ -49,7 +56,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
// model_params.n_gpu_layers = 99; // offload all layers to the GPU
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
|
|
|
@ -652,7 +652,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
|||
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
|
||||
|
||||
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||
if (ggml_is_quantized(a->type)) {
|
||||
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
|
||||
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
|
||||
} else if (a->type == GGML_TYPE_F32) {
|
||||
return ggml_add(ctx, a, b);
|
||||
|
@ -1459,6 +1459,17 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
|
|||
}
|
||||
params->n_rank_w3 = std::stoi(argv[i]);
|
||||
params->custom_n_rank_w3 = true;
|
||||
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params->common.n_gpu_layers = std::stoi(argv[i]);
|
||||
#else
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
train_print_usage(argc, argv, &default_params);
|
||||
|
@ -1545,6 +1556,7 @@ int main(int argc, char ** argv) {
|
|||
srand(params.common.seed);
|
||||
|
||||
struct llama_model_params llama_mparams = llama_model_default_params();
|
||||
llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
|
||||
llama_mparams.vocab_only = false;
|
||||
|
||||
printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
|
||||
|
|
34
examples/finetune/finetune.sh
Normal file
34
examples/finetune/finetune.sh
Normal file
|
@ -0,0 +1,34 @@
|
|||
#!/bin/bash
|
||||
cd `dirname $0`
|
||||
cd ../..
|
||||
|
||||
EXE="./finetune"
|
||||
|
||||
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
|
||||
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
|
||||
|
||||
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
|
||||
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
|
||||
|
||||
while getopts "dg" opt; do
|
||||
case $opt in
|
||||
d)
|
||||
DEBUGGER="gdb --args"
|
||||
;;
|
||||
g)
|
||||
EXE="./build/bin/Release/finetune"
|
||||
GPUARG="--gpu-layers 25"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
$DEBUGGER $EXE \
|
||||
--model-base $MODEL \
|
||||
$GPUARG \
|
||||
--checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
|
||||
--checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
|
||||
--lora-out lora-ol3b-shakespeare-ITERATION.bin \
|
||||
--train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
|
||||
--save-every 10 \
|
||||
--threads 10 --adam-iter 30 --batch 4 --ctx 64 \
|
||||
--use-checkpointing
|
|
@ -1037,7 +1037,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
test t(inst, lmodel, ctx);
|
||||
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
// warmup run
|
||||
if (t.n_prompt > 0) {
|
||||
|
@ -1048,7 +1048,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
for (int i = 0; i < params.reps; i++) {
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
uint64_t t_start = get_time_ns();
|
||||
if (t.n_prompt > 0) {
|
||||
|
|
|
@ -16,6 +16,8 @@ add_library(common OBJECT
|
|||
${_common_path}/console.cpp
|
||||
${_common_path}/grammar-parser.h
|
||||
${_common_path}/grammar-parser.cpp
|
||||
${_common_path}/sampling.h
|
||||
${_common_path}/sampling.cpp
|
||||
)
|
||||
|
||||
# WARNING: because build-info.h is auto-generated, it will only
|
||||
|
|
|
@ -208,6 +208,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
|
|||
|
||||
Example usage: `--top-p 0.95`
|
||||
|
||||
### Min P Sampling
|
||||
|
||||
- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
|
||||
|
||||
The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
|
||||
|
||||
Example usage: `--min-p 0.05`
|
||||
|
||||
### Tail Free Sampling (TFS)
|
||||
|
||||
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
|
||||
|
|
|
@ -306,7 +306,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// remove any "future" tokens that we might have inherited from the previous session
|
||||
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
|
||||
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
||||
}
|
||||
|
||||
LOGLN(
|
||||
|
|
|
@ -210,7 +210,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
|
@ -339,7 +339,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
|
@ -573,7 +573,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||
}
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
|
||||
if (logits.empty()) {
|
||||
|
|
|
@ -18,7 +18,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
|
||||
|
@ -31,7 +30,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
|
||||
#endif
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
|
@ -70,13 +68,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
|||
}
|
||||
|
||||
// usage:
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
//
|
||||
[[noreturn]]
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
||||
printf("\nAllowed quantization types:\n");
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
if (it.name != "COPY") {
|
||||
|
@ -103,6 +102,8 @@ int main(int argc, char ** argv) {
|
|||
params.quantize_output_tensor = false;
|
||||
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
||||
params.allow_requantize = true;
|
||||
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
|
||||
params.pure = true;
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
|
|
@ -149,6 +149,7 @@ struct task_server {
|
|||
task_type type;
|
||||
json data;
|
||||
bool infill_mode = false;
|
||||
bool embedding_mode = false;
|
||||
};
|
||||
|
||||
struct task_result {
|
||||
|
@ -371,6 +372,7 @@ struct llama_client_slot
|
|||
std::vector<completion_token_output> generated_token_probs;
|
||||
|
||||
bool infill = false;
|
||||
bool embedding = false;
|
||||
bool has_next_token = true;
|
||||
bool truncated = false;
|
||||
bool stopped_eos = false;
|
||||
|
@ -454,7 +456,7 @@ struct llama_client_slot
|
|||
}
|
||||
|
||||
void release() {
|
||||
if (state == PROCESSING)
|
||||
if (state == IDLE || state == PROCESSING)
|
||||
{
|
||||
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
|
||||
command = RELEASE;
|
||||
|
@ -754,6 +756,7 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
slot->params.antiprompt.clear();
|
||||
|
||||
const auto &stop = data.find("stop");
|
||||
if (stop != data.end() && stop->is_array())
|
||||
{
|
||||
|
@ -856,7 +859,7 @@ struct llama_server_context
|
|||
|
||||
void kv_cache_clear() {
|
||||
// clear the entire KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
clean_kv_cache = false;
|
||||
}
|
||||
|
||||
|
@ -867,7 +870,7 @@ struct llama_server_context
|
|||
|
||||
kv_cache_clear();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i)
|
||||
for (int i = 0; i < (int) system_tokens.size(); ++i)
|
||||
{
|
||||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
|
@ -894,16 +897,8 @@ struct llama_server_context
|
|||
{
|
||||
slot.release();
|
||||
}
|
||||
wait_all_are_idle();
|
||||
all_slots_are_idle = true;
|
||||
|
||||
// wait until system prompt load
|
||||
system_need_update = true;
|
||||
while (system_need_update)
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(5));
|
||||
}
|
||||
// system prompt loaded, continue
|
||||
}
|
||||
|
||||
void process_system_prompt_data(const json &sys_props) {
|
||||
|
@ -915,26 +910,6 @@ struct llama_server_context
|
|||
{
|
||||
notify_system_prompt_changed();
|
||||
}
|
||||
else
|
||||
{
|
||||
system_need_update = true;
|
||||
}
|
||||
}
|
||||
|
||||
void wait_all_are_idle() {
|
||||
bool wait = true;
|
||||
while (wait)
|
||||
{
|
||||
wait = false;
|
||||
for (auto &slot : slots)
|
||||
{
|
||||
if (!slot.available())
|
||||
{
|
||||
wait = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
|
||||
|
@ -965,7 +940,6 @@ struct llama_server_context
|
|||
slot.has_next_token = false;
|
||||
}
|
||||
stop_pos = pos;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1272,13 +1246,14 @@ struct llama_server_context
|
|||
queue_results.push_back(res);
|
||||
}
|
||||
|
||||
int request_completion(json data, bool infill)
|
||||
int request_completion(json data, bool infill, bool embedding)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
task_server task;
|
||||
task.id = id_gen++;
|
||||
task.data = data;
|
||||
task.infill_mode = infill;
|
||||
task.embedding_mode = embedding;
|
||||
task.type = COMPLETION_TASK;
|
||||
queue_tasks.push_back(task);
|
||||
return task.id;
|
||||
|
@ -1404,7 +1379,7 @@ struct llama_server_context
|
|||
{
|
||||
LOG_TEE("slot unavailable\n");
|
||||
// send error result
|
||||
send_error(task.id, "slot unavaliable");
|
||||
send_error(task.id, "slot unavailable");
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1416,6 +1391,7 @@ struct llama_server_context
|
|||
slot->reset();
|
||||
|
||||
slot->infill = task.infill_mode;
|
||||
slot->embedding = task.embedding_mode;
|
||||
slot->task_id = task.id;
|
||||
|
||||
if (!launch_slot_with_data(slot, task.data))
|
||||
|
@ -1444,7 +1420,7 @@ struct llama_server_context
|
|||
process_tasks();
|
||||
|
||||
// update the system prompt wait until all slots are idle state
|
||||
if (system_need_update)
|
||||
if (system_need_update && all_slots_are_idle)
|
||||
{
|
||||
LOG_TEE("updating system prompt\n");
|
||||
update_system_prompt();
|
||||
|
@ -1498,7 +1474,7 @@ struct llama_server_context
|
|||
for (auto & slot : slots)
|
||||
{
|
||||
// release the slot
|
||||
if (slot.state == PROCESSING && slot.command == RELEASE)
|
||||
if (slot.command == RELEASE)
|
||||
{
|
||||
slot.state = IDLE;
|
||||
slot.command = NONE;
|
||||
|
@ -1509,7 +1485,7 @@ struct llama_server_context
|
|||
continue;
|
||||
}
|
||||
|
||||
if (slot.state == IDLE || slot.command == RELEASE)
|
||||
if (slot.state == IDLE)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
@ -1530,6 +1506,17 @@ struct llama_server_context
|
|||
{
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
|
||||
|
||||
// empty prompt passed -> release the slot and send empty response
|
||||
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
|
||||
{
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
continue;
|
||||
}
|
||||
|
||||
// need process the prompt
|
||||
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
|
||||
{
|
||||
|
@ -1712,7 +1699,7 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
// prompt evaluated for embedding
|
||||
if (params.embedding)
|
||||
if (slot.embedding)
|
||||
{
|
||||
send_embedding(slot);
|
||||
slot.release();
|
||||
|
@ -1749,8 +1736,8 @@ struct llama_server_context
|
|||
if (!process_token(result, slot))
|
||||
{
|
||||
slot.release();
|
||||
send_final_response(slot);
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
}
|
||||
|
||||
slot.i_batch = -1;
|
||||
|
@ -1766,15 +1753,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
printf("usage: %s [options]\n", argv0);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
{
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
|
@ -1924,6 +1912,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--threads-batch" || arg == "-tb")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_batch = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-b" || arg == "--batch-size")
|
||||
{
|
||||
if (++i >= argc)
|
||||
|
@ -2281,11 +2278,11 @@ int main(int argc, char **argv)
|
|||
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
json data = json::parse(req.body);
|
||||
const int task_id = llama.request_completion(data, false);
|
||||
const int task_id = llama.request_completion(data, false, false);
|
||||
if (!json_value(data, "stream", false)) {
|
||||
std::string completion_text;
|
||||
task_result result = llama.next_result(task_id);
|
||||
if(!result.error && result.stop) {
|
||||
if (!result.error && result.stop) {
|
||||
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
|
||||
}
|
||||
else
|
||||
|
@ -2312,7 +2309,7 @@ int main(int argc, char **argv)
|
|||
{
|
||||
return false;
|
||||
}
|
||||
if(result.stop) {
|
||||
if (result.stop) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
|
@ -2336,7 +2333,7 @@ int main(int argc, char **argv)
|
|||
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
json data = json::parse(req.body);
|
||||
const int task_id = llama.request_completion(data, true);
|
||||
const int task_id = llama.request_completion(data, true, false);
|
||||
if (!json_value(data, "stream", false)) {
|
||||
std::string completion_text;
|
||||
task_result result = llama.next_result(task_id);
|
||||
|
@ -2440,7 +2437,7 @@ int main(int argc, char **argv)
|
|||
{
|
||||
prompt = "";
|
||||
}
|
||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
|
||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
|
||||
task_result result = llama.next_result(task_id);
|
||||
return res.set_content(result.result_json.dump(), "application/json");
|
||||
});
|
||||
|
|
|
@ -95,13 +95,8 @@ int main(int argc, char ** argv) {
|
|||
llama_batch batch = llama_batch_init(512, 0, 1);
|
||||
|
||||
// evaluate the initial prompt
|
||||
batch.n_tokens = tokens_list.size();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
batch.token[i] = tokens_list[i];
|
||||
batch.pos[i] = i;
|
||||
batch.seq_id[i] = 0;
|
||||
batch.logits[i] = false;
|
||||
for (size_t i = 0; i < tokens_list.size(); i++) {
|
||||
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
|
@ -148,15 +143,10 @@ int main(int argc, char ** argv) {
|
|||
fflush(stdout);
|
||||
|
||||
// prepare the next batch
|
||||
batch.n_tokens = 0;
|
||||
llama_batch_clear(batch);
|
||||
|
||||
// push this new token for next evaluation
|
||||
batch.token [batch.n_tokens] = new_token_id;
|
||||
batch.pos [batch.n_tokens] = n_cur;
|
||||
batch.seq_id[batch.n_tokens] = 0;
|
||||
batch.logits[batch.n_tokens] = true;
|
||||
|
||||
batch.n_tokens += 1;
|
||||
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
|
||||
|
||||
n_decode += 1;
|
||||
}
|
||||
|
|
|
@ -8,6 +8,9 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
|
||||
struct seq_draft {
|
||||
bool active = false;
|
||||
bool drafting = false;
|
||||
|
@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
|
|||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
||||
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
|
||||
|
||||
{
|
||||
const int n_vocab_tgt = llama_n_vocab(model_tgt);
|
||||
const int n_vocab_dft = llama_n_vocab(model_dft);
|
||||
const int vocab_diff = n_vocab_tgt > n_vocab_dft
|
||||
? n_vocab_tgt - n_vocab_dft
|
||||
: n_vocab_dft - n_vocab_tgt;
|
||||
|
||||
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
|
||||
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
||||
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
|
||||
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
|
||||
const char * token_text_dft = llama_token_get_text(model_dft, i);
|
||||
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
||||
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
|
||||
llama_token_to_piece(ctx_tgt, i).c_str(),
|
||||
llama_token_to_piece(ctx_dft, i).c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
|
||||
|
@ -118,7 +148,7 @@ int main(int argc, char ** argv) {
|
|||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
|
||||
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
params.sparams.temp = std::max(0.01f, params.sparams.temp);
|
||||
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
|
||||
|
@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
|
|||
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
||||
llama_decode (ctx_dft, batch_dft);
|
||||
|
||||
++n_past_dft;
|
||||
|
@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
|
|||
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
|
||||
}
|
||||
|
||||
//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
|
||||
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
++n_past_tgt;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue