Merge branch 'master' into concedo_experimental

# Conflicts:
#    flake.lock
#    flake.nix
commit 9342636408
11 changed files with 1933 additions and 2199 deletions

.gitignore (vendored): 1 changed line

@@ -4,6 +4,7 @@
 .DS_Store
 .build/
 .cache/
+.ccls-cache/
 .direnv/
 .envrc
 .swiftpm

common/common.cpp

@@ -218,6 +218,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             sparams.top_p = std::stof(argv[i]);
+        } else if (arg == "--min-p") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.min_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -679,6 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
     printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
     printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@@ -1275,6 +1282,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }

common/sampling.cpp

@@ -89,10 +89,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
 
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
+            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
             params.mirostat, params.mirostat_eta, params.mirostat_tau);
 
     return std::string(result);
@@ -110,6 +110,7 @@ llama_token llama_sampling_sample(
     const float   temp            = params.temp;
     const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
     const float   top_p           = params.top_p;
+    const float   min_p           = params.min_p;
     const float   tfs_z           = params.tfs_z;
     const float   typical_p       = params.typical_p;
     const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
@@ -190,6 +191,7 @@ llama_token llama_sampling_sample(
             llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
             llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
             llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
+            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
             llama_sample_temp     (ctx_main, &cur_p, temp);
 
             id = llama_sample_token(ctx_main, &cur_p);

common/sampling.h

@@ -14,6 +14,7 @@ typedef struct llama_sampling_params {
     int32_t n_probs   = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t top_k     = 40;    // <= 0 to use vocab size
     float   top_p     = 0.95f; // 1.0 = disabled
+    float   min_p     = 0.05f; // 0.0 = disabled
     float   tfs_z     = 1.00f; // 1.0 = disabled
     float   typical_p = 1.00f; // 1.0 = disabled
     float   temp      = 0.80f; // 1.0 = disabled

examples/main/README.md

@@ -208,6 +208,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
 
 Example usage: `--top-p 0.95`
 
+### Min P Sampling
+
+- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
+
+The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
+
+Example usage: `--min-p 0.05`
+
 ### Tail Free Sampling (TFS)
 
 - `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
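
The arithmetic in the added README paragraph can be checked with a few lines of standalone C++ (illustrative only, not code from this diff): a token is kept when its probability is at least `p` times the probability of the most likely token.

```cpp
// Standalone sketch of the min-p rule described in the README text above.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> probs = {0.90f, 0.06f, 0.03f, 0.01f}; // already normalized
    const float p = 0.05f;

    // Threshold is relative to the most likely token: 0.05 * 0.90 = 0.045.
    const float threshold = p * *std::max_element(probs.begin(), probs.end());

    for (const float prob : probs) {
        std::printf("%.3f -> %s\n", prob, prob >= threshold ? "kept" : "filtered out");
    }
    return 0;
}
```

With these numbers, 0.90 and 0.06 survive while 0.03 and 0.01 fall below the 0.045 threshold, matching the worked example in the README paragraph.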

examples/server/server.cpp

@@ -149,6 +149,7 @@ struct task_server
     task_type type;
     json data;
     bool infill_mode = false;
+    bool embedding_mode = false;
 };
 
 struct task_result {
@@ -371,6 +372,7 @@ struct llama_client_slot
     std::vector<completion_token_output> generated_token_probs;
 
     bool infill = false;
+    bool embedding = false;
     bool has_next_token = true;
     bool truncated = false;
     bool stopped_eos = false;
@@ -1244,13 +1246,14 @@ struct llama_server_context
         queue_results.push_back(res);
     }
 
-    int request_completion(json data, bool infill)
+    int request_completion(json data, bool infill, bool embedding)
     {
         std::lock_guard<std::mutex> lock(mutex_tasks);
         task_server task;
         task.id = id_gen++;
         task.data = data;
         task.infill_mode = infill;
+        task.embedding_mode = embedding;
         task.type = COMPLETION_TASK;
         queue_tasks.push_back(task);
         return task.id;
@@ -1376,7 +1379,7 @@ struct llama_server_context
                 {
                     LOG_TEE("slot unavailable\n");
                     // send error result
-                    send_error(task.id, "slot unavaliable");
+                    send_error(task.id, "slot unavailable");
                     return;
                 }
 
@@ -1388,6 +1391,7 @@ struct llama_server_context
                 slot->reset();
 
                 slot->infill = task.infill_mode;
+                slot->embedding = task.embedding_mode;
                 slot->task_id = task.id;
 
                 if (!launch_slot_with_data(slot, task.data))
@@ -1695,7 +1699,7 @@ struct llama_server_context
                 }
 
                 // prompt evaluated for embedding
-                if (params.embedding)
+                if (slot.embedding)
                 {
                     send_embedding(slot);
                     slot.release();
@@ -2274,7 +2278,7 @@ int main(int argc, char **argv)
     svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
             {
                 json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, false);
+                const int task_id = llama.request_completion(data, false, false);
                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
                     task_result result = llama.next_result(task_id);
@@ -2329,7 +2333,7 @@ int main(int argc, char **argv)
     svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                 json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, true);
+                const int task_id = llama.request_completion(data, true, false);
                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
                     task_result result = llama.next_result(task_id);
@@ -2433,7 +2437,7 @@ int main(int argc, char **argv)
                 {
                     prompt = "";
                 }
-                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
+                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
                 task_result result = llama.next_result(task_id);
                 return res.set_content(result.result_json.dump(), "application/json");
             });

ggml-metal.m: 11 changed lines

@@ -238,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     // load kernels
     {
         NSError * error = nil;
-#define GGML_METAL_ADD_KERNEL(name) \
-        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+        /*
         GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                 (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                 (int) ctx->pipeline_##name.threadExecutionWidth); \
+        */
+#define GGML_METAL_ADD_KERNEL(name) \
+        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
         if (error) { \
             GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
             return NULL; \
         }
 

ggml.h: 2 changed lines

@@ -716,7 +716,7 @@ extern "C" {
     // Context tensor enumeration and lookup
     GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
     GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+    GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);

llama.h: 7 changed lines

@@ -600,6 +600,13 @@ extern "C" {
             float p,
             size_t min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
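
For orientation, the sketch below shows where the newly declared `llama_sample_min_p` sits in a typical sampler chain, mirroring the call order used in `llama_sampling_sample` earlier in this diff. It is not part of the commit; it assumes an already-initialized `llama_context` and a `llama_token_data_array` populated from the current logits, and it uses the default values from `llama_sampling_params`.

```cpp
// Hypothetical caller-side sketch (not from this merge). Assumes `ctx` is an
// initialized llama_context and `cur_p` already holds the candidate tokens.
#include "llama.h"

llama_token sample_with_min_p(llama_context * ctx, llama_token_data_array * cur_p) {
    const size_t min_keep = 1; // always keep at least one candidate

    llama_sample_tail_free(ctx, cur_p, /*tfs_z =*/ 1.00f, min_keep); // 1.0 = disabled
    llama_sample_typical  (ctx, cur_p, /*typ_p =*/ 1.00f, min_keep); // 1.0 = disabled
    llama_sample_top_p    (ctx, cur_p, /*top_p =*/ 0.95f, min_keep);
    llama_sample_min_p    (ctx, cur_p, /*min_p =*/ 0.05f, min_keep); // new in this merge
    llama_sample_temp     (ctx, cur_p, /*temp  =*/ 0.80f);

    return llama_sample_token(ctx, cur_p); // pick a token from the remaining candidates
}
```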

scripts/server-llm.sh (new file): 391 added lines

#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal, OpenCL
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
#
#   --port:       port number, default is 8888
#   --repo:       path to a repo containing GGUF model files
#   --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input
#   --backend:    cpu, cuda, metal, opencl, depends on the OS
#   --gpu-id:     gpu id, default is 0
#   --n-parallel: number of parallel requests, default is 8
#   --n-kv:       KV cache size, default is 4096
#   --verbose:    verbose output
#
# Example:
#
#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#

set -e

# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
    printf "[-] curl not found\n"
    exit 1
fi
if ! command -v git &> /dev/null; then
    printf "[-] git not found\n"
    exit 1
fi
if ! command -v make &> /dev/null; then
    printf "[-] make not found\n"
    exit 1
fi

# parse arguments
port=8888
repo=""
wtype=""
backend="cpu"

# if macOS, use metal backend by default
if [[ "$OSTYPE" == "darwin"* ]]; then
    backend="metal"
elif command -v nvcc &> /dev/null; then
    backend="cuda"
fi

gpu_id=0
n_parallel=8
n_kv=4096
verbose=0

function print_usage {
    printf "Usage:\n"
    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
    printf "  --port:       port number, default is 8888\n"
    printf "  --repo:       path to a repo containing GGUF model files\n"
    printf "  --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
    printf "  --backend:    cpu, cuda, metal, opencl, depends on the OS\n"
    printf "  --gpu-id:     gpu id, default is 0\n"
    printf "  --n-parallel: number of parallel requests, default is 8\n"
    printf "  --n-kv:       KV cache size, default is 4096\n"
    printf "  --verbose:    verbose output\n\n"
    printf "Example:\n\n"
    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}

while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --port)
            port="$2"
            shift
            shift
            ;;
        --repo)
            repo="$2"
            shift
            shift
            ;;
        --wtype)
            wtype="$2"
            shift
            shift
            ;;
        --backend)
            backend="$2"
            shift
            shift
            ;;
        --gpu-id)
            gpu_id="$2"
            shift
            shift
            ;;
        --n-parallel)
            n_parallel="$2"
            shift
            shift
            ;;
        --n-kv)
            n_kv="$2"
            shift
            shift
            ;;
        --verbose)
            verbose=1
            shift
            ;;
        --help)
            print_usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $key"
            print_usage
            exit 1
            ;;
    esac
done

# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")

wfiles=()
for wt in "${wtypes[@]}"; do
    wfiles+=("")
done

# sample repos
repos=(
    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)

printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf "    Based on the options that follow, the script might download a model file\n"
printf "    from the internet, which can be a few GBs in size. The script will also\n"
printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
printf "    model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf "    Please note:\n"
printf "\n"
printf "    - All new data will be stored in the current folder\n"
printf "    - The server will be listening on all network interfaces\n"
printf "    - The server will run with default settings which are not always optimal\n"
printf "    - Do not judge the quality of a model based on the results from this script\n"
printf "    - Do not use this script to benchmark llama.cpp\n"
printf "    - Do not use this script in production\n"
printf "    - This script is only for demonstration purposes\n"
printf "\n"
printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf "    Press Enter to continue ...\n\n"

read

if [[ -z "$repo" ]]; then
    printf "[+] No repo provided from the command line\n"
    printf "    Please select a number from the list below or enter an URL:\n\n"

    is=0
    for r in "${repos[@]}"; do
        printf "    %2d) %s\n" $is "$r"
        is=$((is+1))
    done

    # ask for repo until index of sample repo is provided or an URL
    while [[ -z "$repo" ]]; do
        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
        read -p "[+] Select repo: " repo

        # check if the input is a number
        if [[ "$repo" =~ ^[0-9]+$ ]]; then
            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
                repo="${repos[$repo]}"
            else
                printf "[-] Invalid repo index: %s\n" "$repo"
                repo=""
            fi
        elif [[ "$repo" =~ ^https?:// ]]; then
            repo="$repo"
        else
            printf "[-] Invalid repo URL: %s\n" "$repo"
            repo=""
        fi
    done
fi

# remove suffix
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')

printf "[+] Checking for GGUF model files in %s\n" "$repo"

# find GGUF files in the source
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')

# list all files in the provided git repo
printf "[+] Model files:\n\n"
for file in $model_files; do
    # determine iw by grepping the filename with wtypes
    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        # uppercase
        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
        if [[ "$ufile" =~ "$wt" ]]; then
            iw=$is
            break
        fi
        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        continue
    fi

    wfiles[$iw]="$file"

    have=" "
    if [[ -f "$file" ]]; then
        have="*"
    fi

    printf "    %2d) %s %s\n" $iw "$have" "$file"
done

# ask for weights type until provided and available
while [[ -z "$wtype" ]]; do
    printf "\n"
    read -p "[+] Select weight type: " wtype
    wfile="${wfiles[$wtype]}"

    if [[ -z "$wfile" ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        wtype=""
    fi
done

printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"

url="${repo%/}/resolve/main/$wfile"

# check file if the model has been downloaded before
chk="$wfile.chk"

# check if we should download the file
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info

do_download=0

if [[ ! -f "$wfile" ]]; then
    do_download=1
elif [[ ! -f "$chk" ]]; then
    do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
    do_download=1
fi

if [[ $do_download -eq 1 ]]; then
    printf "[+] Downloading weights from %s\n" "$url"

    # download the weights file
    curl -o "$wfile" -# -L "$url"

    # create a check file if successful
    if [[ $? -eq 0 ]]; then
        printf "[+] Creating check file %s\n" "$chk"
        touch "$chk"
    fi
else
    printf "[+] Using cached weights %s\n" "$wfile"
fi

# get latest llama.cpp and build

printf "[+] Downloading latest llama.cpp\n"

llama_cpp_dir="__llama_cpp_port_${port}__"

if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[-] Please remove it and try again\n"
    exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[+] Using cached llama.cpp\n"

    cd "$llama_cpp_dir"
    git reset --hard
    git fetch
    git checkout origin/master

    cd ..
else
    printf "[+] Cloning llama.cpp\n"

    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi

# mark that that the directory is made by this script
touch "$llama_cpp_dir/__ggml_script__"

if [[ $verbose -eq 1 ]]; then
    set -x
fi

# build
cd "$llama_cpp_dir"

make clean

log="--silent"
if [[ $verbose -eq 1 ]]; then
    log=""
fi

if [[ "$backend" == "cuda" ]]; then
    printf "[+] Building with CUDA backend\n"
    LLAMA_CUBLAS=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
    printf "[+] Building with CPU backend\n"
    make -j server $log
elif [[ "$backend" == "metal" ]]; then
    printf "[+] Building with Metal backend\n"
    make -j server $log
elif [[ "$backend" == "opencl" ]]; then
    printf "[+] Building with OpenCL backend\n"
    LLAMA_CLBLAST=1 make -j server $log
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

# run the server

printf "[+] Running server\n"

args=""
if [[ "$backend" == "cuda" ]]; then
    export CUDA_VISIBLE_DEVICES=$gpu_id
    args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
    args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
    args="-ngl 999"
elif [[ "$backend" == "opencl" ]]; then
    args="-ngl 999"
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

if [[ $verbose -eq 1 ]]; then
    args="$args --verbose"
fi

./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args

exit 0