Merge pull request #18 from anon998/update-readme
Update readme + parse --mlock and --no-mmap
commit df2ecc942a
2 changed files with 99 additions and 101 deletions
@@ -1,14 +1,22 @@

# llama.cpp/example/server

This example allow you to have a llama.cpp http server to interact from a web page or consume the API.
This example demonstrates a simple HTTP API server to interact with llama.cpp.

Command line options:

- `--threads N`, `-t N`: use N threads.
- `--threads N`, `-t N`: Set the number of threads to use during computation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
- `--port`: Set the port to listen. Default: `8080`.
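
Taken together, these flags populate the `gpt_params` structure that the parser in `server.cpp` fills in. A minimal sketch of what an invocation such as `./server -m models/7B/ggml-model.bin -c 2048 --mlock --no-mmap` is expected to produce after parsing; it assumes llama.cpp's `examples/common.h`, and the values are illustrative only:

```cpp
#include "common.h"   // gpt_params (model, n_ctx, use_mlock, use_mmap, ...)

int main() {
    gpt_params params;
    params.model     = "models/7B/ggml-model.bin"; // -m / --model
    params.n_ctx     = 2048;   // -c / --ctx-size
    params.use_mlock = true;   // --mlock: keep the model resident in RAM
    params.use_mmap  = false;  // --no-mmap: load the model instead of memory-mapping it
    return 0;
}
```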
## Quick Start
@@ -79,10 +87,7 @@ node .
## API Endpoints
You can interact with these API endpoints.
This implementation only supports chat-style interaction.
- **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks.
- **POST** `/completion`: Given a prompt, it returns the predicted completion.

*Options:*
@@ -102,10 +107,35 @@ This implementation only supports chat-style interaction.

`prompt`: Provide a prompt. Internally, the prompt is compared to the previous one; any part that has already been evaluated is detected, and only the remaining part is evaluated.
`stop`: Specify the strings that indicate a stop.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
Default: `[]`
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).

- **POST** `hostname:port/tokenize`: Tokenize a given text
`tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).

`typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).

`repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).

`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).

`penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).

`presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled).

`frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled).

`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).

`mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).

`mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).

`seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).

`ignore_eos`: Ignore end of stream token and continue generating (default: false).

`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `logit-bias: [[15043,1]]` to increase the likelihood of the token 'Hello', or `logit-bias: [[15043,-1]]` to decrease its likelihood. Setting the value to false, `logit-bias: [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
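
Putting the options above together, a completion request is just a JSON body POSTed to the server. A minimal client-side sketch, assuming the nlohmann `json.hpp` and cpp-httplib `httplib.h` headers this example builds against; the field values are illustrative only:

```cpp
#include <cstdio>
#include "httplib.h"
#include "json.hpp"

using json = nlohmann::json;

int main() {
    // Build a /completion request body; the keys mirror the options above.
    json body = {
        {"prompt", "Building a website can be done in 10 simple steps:"},
        {"stop", {"\n### Human:"}},        // strings that end the completion
        {"repeat_penalty", 1.1},
        {"mirostat", 2},                   // 2 = Mirostat 2.0
        {"mirostat_tau", 5.0},
        {"logit_bias", {{15043, 1.0}}}     // nudge token 15043 ('Hello') upward
    };

    httplib::Client cli("localhost", 8080);
    auto res = cli.Post("/completion", body.dump(), "application/json");
    if (res) {
        std::printf("%s\n", res->body.c_str());
    }
    return 0;
}
```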
- **POST** `/tokenize`: Tokenize a given text.
*Options:*
@@ -422,7 +422,6 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params, con
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
if (llama_mlock_supported())
{
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
@@ -447,100 +446,73 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params, con
fprintf(stderr, "\n");
}

void server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
void server_params_parse(int argc, char **argv, server_params &sparams,
gpt_params &params)
{
gpt_params default_params;
server_params default_sparams;
std::string arg;
bool invalid_param = false;

for (int i = 1; i < argc; i++)
{
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "--port")
{
if (++i >= argc)
{
if (arg == "--port") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.port = std::stoi(argv[i]);
}
else if (arg == "--host")
{
if (++i >= argc)
{
} else if (arg == "--host") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.hostname = argv[i];
}
else if (arg == "--timeout" || arg == "-to")
{
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.read_timeout = std::stoi(argv[i]);
sparams.write_timeout = std::stoi(argv[i]);
}
else if (arg == "-m" || arg == "--model")
{
if (++i >= argc)
{
} else if (arg == "--timeout" || arg == "-to") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.read_timeout = std::stoi(argv[i]);
sparams.write_timeout = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model = argv[i];
}
else if (arg == "-a" || arg == "--alias")
{
if (++i >= argc)
{
} else if (arg == "-a" || arg == "--alias") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_alias = argv[i];
}
else if (arg == "-h" || arg == "--help")
{
} else if (arg == "-h" || arg == "--help") {
server_print_usage(argc, argv, default_params, default_sparams);
exit(0);
}
else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
{
if (++i >= argc)
{
} else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
}
else if (arg == "--memory-f32" || arg == "--memory_f32")
{
} else if (arg == "--memory-f32" || arg == "--memory_f32") {
params.memory_f16 = false;
}
else if (arg == "--threads" || arg == "-t")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads = std::stoi(argv[i]);
}
else if (arg == "-b" || arg == "--batch-size")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
}
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
{
if (++i >= argc)
{
} else if (arg == "--threads" || arg == "-t") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads = std::stoi(argv[i]);
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
@@ -550,37 +522,33 @@ void server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
}
else if (arg == "--lora")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.lora_adapter = argv[i];
params.use_mmap = false;
}
else if (arg == "--lora-base")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
} else if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter = argv[i];
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
} else if (arg == "-v" || arg == "--verbose") {
sparams.verbose = true;
}
else
{
sparams.verbose = true;
} else if (arg == "--mlock") {
params.use_mlock = true;
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
server_print_usage(argc, argv, default_params, default_sparams);
exit(1);
}
}

if (invalid_param)
{
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
server_print_usage(argc, argv, default_params, default_sparams);
exit(1);
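
For context, a minimal sketch of how the newly parsed `--mlock` and `--no-mmap` values typically reach the llama context at load time; `llama_context_default_params`, `use_mmap`, `use_mlock` and `llama_init_from_file` come from `llama.h`, and the wiring shown here is illustrative rather than the code in this diff:

```cpp
#include "common.h"  // gpt_params
#include "llama.h"

// Forward the parsed flags to the context parameters before loading the model.
static llama_context *load_ctx(const gpt_params &params) {
    llama_context_params lparams = llama_context_default_params();
    lparams.n_ctx     = params.n_ctx;
    lparams.use_mmap  = params.use_mmap;   // cleared by --no-mmap (and by --lora)
    lparams.use_mlock = params.use_mlock;  // set by --mlock
    return llama_init_from_file(params.model.c_str(), lparams);
}
```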
@@ -757,8 +725,6 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
int main(int argc, char **argv)
{
llama_init_backend();

// own arguments required by this example
gpt_params params;
server_params sparams;
@@ -776,6 +742,8 @@ int main(int argc, char **argv)
params.model_alias = params.model;
}

llama_init_backend();

fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads,
std::thread::hardware_concurrency(), llama_print_system_info());
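
The timeouts, host and port collected in `sparams` are ultimately handed to the embedded HTTP server. A minimal sketch assuming cpp-httplib's `Server` API (`set_read_timeout`, `set_write_timeout`, `listen`); a matching subset of the `server_params` struct from `server.cpp` is declared locally so the sketch stands alone:

```cpp
#include <string>
#include "httplib.h"

// Illustrative subset of the server_params struct defined in server.cpp.
struct server_params {
    std::string hostname = "127.0.0.1";
    int port = 8080;
    int read_timeout = 600;
    int write_timeout = 600;
};

// Apply the parsed options to the embedded cpp-httplib server.
static void run(const server_params &sparams) {
    httplib::Server svr;
    svr.set_read_timeout(sparams.read_timeout, 0);        // -to / --timeout, in seconds
    svr.set_write_timeout(sparams.write_timeout, 0);
    svr.listen(sparams.hostname.c_str(), sparams.port);   // --host / --port
}
```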