diff --git a/CMakeLists.txt b/CMakeLists.txt index 2846d9b94..cc7560a7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -250,6 +250,15 @@ if (LLAMA_CUBLAS) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + if (LLAMA_CUDA_DMMV_F16) + set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics + else() + set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard + endif() + endif() + message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") + else() message(WARNING "cuBLAS not found") endif() @@ -493,22 +502,6 @@ if (BUILD_SHARED_LIBS) endif() endif() -if (GGML_SOURCES_CUDA) - message(STATUS "GGML CUDA sources found, configuring CUDA architecture") - set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - - set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - - if (BUILD_SHARED_LIBS) - set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - endif() - - set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native") -endif() - # # programs, examples and tests diff --git a/README.md b/README.md index 2d05de333..ace588606 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1 - Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729 -- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642 -- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684 -- Multi-GPU support: 
https://github.com/ggerganov/llama.cpp/pull/1607 -- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652 -- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
Table of Contents @@ -344,7 +340,7 @@ Building the program with BLAS support may lead to some performance improvements | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast @@ -378,7 +374,7 @@ Building the program with BLAS support may lead to some performance improvements ```sh git clone https://github.com/CNugteren/CLBlast.git mkdir CLBlast/build - cd CLBLast/build + cd CLBlast/build cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF cmake --build . --config Release cmake --install . --prefix /some/path diff --git a/examples/server/README.md b/examples/server/README.md index 474a28b20..fa95c0044 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -21,6 +21,7 @@ Command line options: - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. 
- `--port`: Set the port to listen. Default: `8080`. +- `--embedding`: Enable embedding extraction. Default: disabled. ## Build @@ -119,14 +120,14 @@ node . `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). - `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity). + `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity). `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. - `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. + `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluated. A space is inserted at the front, as main.cpp does. `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). @@ -163,6 +164,14 @@ node . `content`: Set the text to tokenize. + Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
+ +- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. + + *Options:* + + `content`: Set the text to process. + ## More examples ### Interactive mode diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 12d4e2fa4..c0984aadb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -254,6 +254,11 @@ struct llama_server_context { n_past += n_eval; } + if (params.n_predict == 0) { + has_next_token = false; + return llama_token_eos(); + } + // out of user input, sample next token const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; @@ -419,6 +424,19 @@ struct llama_server_context { return token_text; } + + std::vector getEmbedding() { + static const int n_embd = llama_n_embd(ctx); + if (!params.embedding) { + LOG_WARNING("embedding disabled", { + { "params.embedding", params.embedding }, + }); + return std::vector(n_embd, 0.0f); + } + const float * data = llama_get_embeddings(ctx); + std::vector embedding(data, data + n_embd); + return embedding; + } }; static void server_print_usage(const char * argv0, const gpt_params & params, @@ -457,6 +475,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? 
"enabled" : "disabled"); fprintf(stderr, "\n"); } @@ -603,6 +622,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, params.use_mlock = true; } else if (arg == "--no-mmap") { params.use_mmap = false; + } else if (arg == "--embedding") { + params.embedding = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); @@ -646,6 +667,12 @@ static json format_generation_settings(llama_server_context & llama) { }; } +static json format_embedding_response(llama_server_context & llama) { + return json { + { "embedding", llama.getEmbedding() }, + }; +} + static json format_final_response(llama_server_context & llama, const std::string & content) { return json { { "content", content }, @@ -881,12 +908,27 @@ int main(int argc, char ** argv) { svr.Post("/tokenize", [&llama](const Request & req, Response & res) { const json body = json::parse(req.body); - const std::string content = body["content"].get(); + const std::string content = body.value("content", ""); const std::vector tokens = llama_tokenize(llama.ctx, content, false); const json data = format_tokenizer_response(tokens); return res.set_content(data.dump(), "application/json"); }); + svr.Post("/embedding", [&llama](const Request & req, Response & res) { + const json body = json::parse(req.body); + + llama.rewind(); + llama_reset_timings(llama.ctx); + llama.params.prompt = body.value("content", ""); + llama.params.n_predict = 0; + llama.loadPrompt(); + llama.beginCompletion(); + llama.doCompletion(); + + const json data = format_embedding_response(llama); + return res.set_content(data.dump(), "application/json"); + }); + svr.set_logger(log_server_request); svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) { diff --git a/ggml.c b/ggml.c index 14e08f9d6..4319683f5 100644 --- a/ggml.c +++ b/ggml.c @@ -18237,7 +18237,6 @@ GGML_API void ggml_opt_init( 
ggml_set_zero(opt->lbfgs.g); ggml_set_zero(opt->lbfgs.gp); ggml_set_zero(opt->lbfgs.d); - ggml_set_zero(opt->lbfgs.pf); if (opt->lbfgs.pf) { ggml_set_zero(opt->lbfgs.pf); } diff --git a/llama.cpp b/llama.cpp index 4a7d01b32..e597f5048 100644 --- a/llama.cpp +++ b/llama.cpp @@ -925,21 +925,21 @@ static bool kv_cache_init( struct llama_context_params llama_context_default_params() { struct llama_context_params result = { + /*.seed =*/ -1, /*.n_ctx =*/ 512, /*.n_batch =*/ 512, /*.gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ {0}, + /*.progress_callback =*/ nullptr, + /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, - /*.seed =*/ -1, /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.vocab_only =*/ false, /*.use_mmap =*/ true, /*.use_mlock =*/ false, /*.embedding =*/ false, - /*.progress_callback =*/ nullptr, - /*.progress_callback_user_data =*/ nullptr, }; return result; diff --git a/llama.h b/llama.h index 1241ba6c0..0de530d45 100644 --- a/llama.h +++ b/llama.h @@ -71,28 +71,27 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); - struct llama_context_params { + struct llama_context_params { + int seed; // RNG seed, -1 for random int n_ctx; // text context int n_batch; // prompt processing batch size int n_gpu_layers; // number of layers to store in VRAM int main_gpu; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs - bool low_vram; // if true, reduce VRAM usage at the cost of performance - int seed; // RNG seed, -1 for random + // called with a progress value between 0 and 1, pass NULL to disable + llama_progress_callback progress_callback; + // context pointer passed to the progress callback + void * progress_callback_user_data; + // Keep the booleans together to avoid misalignment during copy-by-value. 
+ bool low_vram; // if true, reduce VRAM usage at the cost of performance bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only - - // called with a progress value between 0 and 1, pass NULL to disable - llama_progress_callback progress_callback; - // context pointer passed to the progress callback - void * progress_callback_user_data; }; - // model file types enum llama_ftype { LLAMA_FTYPE_ALL_F32 = 0,