From e1e2be21469d4e4aaeb7ffa2a4ba730a5d5cd2f5 Mon Sep 17 00:00:00 2001
From: anon
Date: Fri, 2 Jun 2023 17:47:42 -0300
Subject: [PATCH 1/4] remove --keep from help text

server_params_parse does not accept a --keep argument, so the usage
text should not advertise it.
---
 examples/server/server.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 5e66a7b31..ecae8ecc3 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -422,7 +422,6 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params, con
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     if (llama_mlock_supported())
     {
         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");

From a6ed390cc6c39f5396bb9fd16b2acc605dfa02a8 Mon Sep 17 00:00:00 2001
From: anon
Date: Fri, 2 Jun 2023 17:48:29 -0300
Subject: [PATCH 2/4] update readme

---
 examples/server/README.md | 50 +++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 334d53fa9..0f3fe22e8 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -1,14 +1,22 @@
 # llama.cpp/example/server
 
-This example allow you to have a llama.cpp http server to interact from a web page or consume the API.
+This example demonstrates a simple HTTP API server to interact with llama.cpp.
 
 Command line options:
 
-- `--threads N`, `-t N`: use N threads.
+- `--threads N`, `-t N`: Set the number of threads to use during computation.
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
+- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
+- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended: it doubles the context memory required with no measurable increase in quality.
+- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
+- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies `--no-mmap`). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
+- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default: `600`.
+- `--host`: Set the hostname or IP address to listen on. Default: `127.0.0.1`.
 - `--port`: Set the port to listen. Default: `8080`.
 
 ## Quick Start
@@ -79,10 +87,7 @@ node .
 
 ## API Endpoints
 
-You can interact with this API Endpoints.
-This implementations just support chat style interaction.
-
-- **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks.
+- **POST** `/completion`: Given a prompt, it returns the predicted completion.
 
 *Options:*
 
@@ -102,10 +107,35 @@
 `prompt`: Provide a prompt. Internally, the prompt is compared to the previous one; if part of it has already been evaluated, only the remaining part will be evaluated.
 
 `stop`: Specify the strings that indicate a stop.
-    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
-    Default: `[]`
+    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
 
-- **POST** `hostname:port/tokenize`: Tokenize a given text
+`tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
+
+`typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
+
+`repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).
+
+`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
+
+`penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).
+
+`presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled).
+
+`frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled).
+
+`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
+
+`mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
+
+`mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+
+`seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
+
+`ignore_eos`: Ignore end of stream token and continue generating (default: false).
+
+`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1]]` to increase the likelihood of the token `Hello`, or `"logit_bias": [[15043,-1]]` to decrease its likelihood. Setting the value to false, e.g. `"logit_bias": [[15043,false]]`, ensures that the token `Hello` is never produced (default: []).
+
+- **POST** `/tokenize`: Tokenize a given text.
 
 *Options:*
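
The options documented above map directly onto the JSON body of a `/completion` request. Below is a minimal C++ client sketch for trying them out; it assumes the server is running on the default `127.0.0.1:8080` and reuses the `httplib.h` and `json.hpp` headers the server example already builds against. The prompt and option values are illustrative, not defaults:

```cpp
// Minimal /completion client sketch (illustrative, not part of the patch).
#include "httplib.h"  // cpp-httplib, vendored by the server example
#include "json.hpp"   // nlohmann/json, vendored by the server example

#include <cstdio>

int main() {
    httplib::Client cli("127.0.0.1", 8080);

    // Every field is optional; anything omitted falls back to the
    // documented default.
    nlohmann::json req = {
        {"prompt", "Building a website can be done in 10 simple steps:"},
        {"stop", {"\n\n"}},                // strings that end the completion
        {"repeat_penalty", 1.1},
        {"mirostat", 2},                   // 2 = Mirostat 2.0
        {"mirostat_tau", 5.0},
        {"logit_bias", {{15043, false}}},  // never emit token 15043 ("Hello")
        {"seed", -1}                       // < 0 = random seed
    };

    auto res = cli.Post("/completion", req.dump(), "application/json");
    if (res && res->status == 200) {
        std::printf("%s\n", res->body.c_str());
    } else {
        std::fprintf(stderr, "request failed\n");
    }
    return 0;
}
```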
From 05a5a485b8e41737058df1815b33e2043ad1677c Mon Sep 17 00:00:00 2001
From: anon
Date: Fri, 2 Jun 2023 17:52:04 -0300
Subject: [PATCH 3/4] make help text load faster

Initialize the backend only after the arguments have been parsed, so
that `-h`/`--help` (which exits during parsing) no longer waits for
llama_init_backend().
---
 examples/server/server.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ecae8ecc3..9b653a2f6 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -756,8 +756,6 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
 
 int main(int argc, char **argv)
 {
-    llama_init_backend();
-
     // own arguments required by this example
     gpt_params params;
     server_params sparams;
@@ -775,6 +773,8 @@ int main(int argc, char **argv)
         params.model_alias = params.model;
     }
 
+    llama_init_backend();
+
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
     fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads,
             std::thread::hardware_concurrency(), llama_print_system_info());

From 98ae2de0170955dbaaa4555b28ef8a9954a74eba Mon Sep 17 00:00:00 2001
From: anon
Date: Fri, 2 Jun 2023 17:54:46 -0300
Subject: [PATCH 4/4] parse --mlock and --no-mmap + format

Both flags were already printed by the usage text (and are now in the
README) but were never parsed. Also reformats server_params_parse to a
consistent brace style.
---
 examples/server/server.cpp | 145 +++++++++++++++----------------------
 1 file changed, 57 insertions(+), 88 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9b653a2f6..117a67826 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -446,100 +446,73 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params, con
     fprintf(stderr, "\n");
 }
 
-void server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
+void server_params_parse(int argc, char **argv, server_params &sparams,
+                         gpt_params &params)
 {
     gpt_params default_params;
     server_params default_sparams;
     std::string arg;
     bool invalid_param = false;
 
-    for (int i = 1; i < argc; i++)
-    {
+    for (int i = 1; i < argc; i++) {
         arg = argv[i];
-        if (arg == "--port")
-        {
-            if (++i >= argc)
-            {
+        if (arg == "--port") {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.port = std::stoi(argv[i]);
-        }
-        else if (arg == "--host")
-        {
-            if (++i >= argc)
-            {
+        } else if (arg == "--host") {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.hostname = argv[i];
-        }
-        else if (arg == "--timeout" || arg == "-to")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.read_timeout = std::stoi(argv[i]);
-            sparams.write_timeout = std::stoi(argv[i]);
-        }
-        else if (arg == "-m" || arg == "--model")
-        {
-            if (++i >= argc)
-            {
+        } else if (arg == "--timeout" || arg == "-to") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.read_timeout = std::stoi(argv[i]);
+            sparams.write_timeout = std::stoi(argv[i]);
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.model = argv[i];
-        }
-        else if (arg == "-a" || arg == "--alias")
-        {
-            if (++i >= argc)
-            {
+        } else if (arg == "-a" || arg == "--alias") {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.model_alias = argv[i];
-        }
-        else if (arg == "-h" || arg == "--help")
-        {
+        } else if (arg == "-h" || arg == "--help") {
             server_print_usage(argc, argv, default_params, default_sparams);
             exit(0);
-        }
-        else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
-        {
-            if (++i >= argc)
-            {
+        } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_ctx =
 std::stoi(argv[i]);
-        }
-        else if (arg == "--memory-f32" || arg == "--memory_f32")
-        {
+        } else if (arg == "--memory-f32" || arg == "--memory_f32") {
             params.memory_f16 = false;
-        }
-        else if (arg == "--threads" || arg == "-t")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        }
-        else if (arg == "-b" || arg == "--batch-size")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
-        }
-        else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
-        {
-            if (++i >= argc)
-            {
+        } else if (arg == "--threads" || arg == "-t") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-b" || arg == "--batch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_batch = std::stoi(argv[i]);
+            params.n_batch = std::min(512, params.n_batch);
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -549,37 +522,33 @@
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
-        }
-        else if (arg == "--lora")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter = argv[i];
-            params.use_mmap = false;
-        }
-        else if (arg == "--lora-base")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter = argv[i];
+            params.use_mmap = false;
+        } else if (arg == "--lora-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_base = argv[i];
         } else if (arg == "-v" || arg == "--verbose") {
-            sparams.verbose = true;
-        }
-        else
-        {
+            sparams.verbose = true;
+        } else if (arg == "--mlock") {
+            params.use_mlock = true;
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
+        } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             server_print_usage(argc, argv, default_params, default_sparams);
             exit(1);
         }
     }
 
-    if (invalid_param)
-    {
+    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        server_print_usage(argc, argv, default_params, default_sparams);
        exit(1);
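
The two newly parsed flags do nothing on their own; they only take effect once the parsed `gpt_params` are handed to the model loader. A rough sketch of that hand-off, based on the llama.cpp C API of this period (`llama_context_default_params` / `llama_init_from_file`); the helper function and the selection of fields shown are illustrative, not the server's actual code:

```cpp
// Illustrative sketch: how --mlock / --no-mmap typically reach the loader.
#include "llama.h"   // llama.cpp C API (as of June 2023)
#include "common.h"  // gpt_params, from the examples tree

static llama_context * load_model(const gpt_params & params) {
    llama_context_params lparams = llama_context_default_params();

    lparams.n_ctx     = params.n_ctx;
    lparams.f16_kv    = params.memory_f16;  // cleared by --memory-f32
    lparams.use_mmap  = params.use_mmap;    // cleared by --no-mmap (and --lora)
    lparams.use_mlock = params.use_mlock;   // set by --mlock

    // Returns nullptr on failure; the caller should check before use.
    return llama_init_from_file(params.model.c_str(), lparams);
}
```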