Merge branch 'master' into server-probs
This commit is contained in:
commit
006e74a493
5 changed files with 23 additions and 4 deletions
|
@ -16,6 +16,7 @@ Command line options:
|
||||||
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
|
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
|
||||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
|
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
|
||||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
|
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
|
||||||
|
- `--numa`: Attempt optimizations that help on some NUMA systems.
|
||||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||||
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
#include "index.html.hpp"
|
#include "index.html.hpp"
|
||||||
#include "index.js.hpp"
|
#include "index.js.hpp"
|
||||||
#include "completion.js.hpp"
|
#include "completion.js.hpp"
|
||||||
|
#include "json-schema-to-grammar.mjs.hpp"
|
||||||
|
|
||||||
#ifndef SERVER_VERBOSE
|
#ifndef SERVER_VERBOSE
|
||||||
#define SERVER_VERBOSE 1
|
#define SERVER_VERBOSE 1
|
||||||
|
@ -666,6 +667,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
|
||||||
{
|
{
|
||||||
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||||
}
|
}
|
||||||
|
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
|
||||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||||
fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
|
fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
|
||||||
fprintf(stdout, " number of layers to store in VRAM\n");
|
fprintf(stdout, " number of layers to store in VRAM\n");
|
||||||
|
@ -940,6 +942,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
{
|
{
|
||||||
params.use_mmap = false;
|
params.use_mmap = false;
|
||||||
}
|
}
|
||||||
|
else if (arg == "--numa")
|
||||||
|
{
|
||||||
|
params.numa = true;
|
||||||
|
}
|
||||||
else if (arg == "--embedding")
|
else if (arg == "--embedding")
|
||||||
{
|
{
|
||||||
params.embedding = true;
|
params.embedding = true;
|
||||||
|
@ -1213,6 +1219,12 @@ int main(int argc, char **argv)
|
||||||
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
|
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
|
||||||
return false; });
|
return false; });
|
||||||
|
|
||||||
|
// this is only called if no index.html is found in the public --path
|
||||||
|
svr.Get("/json-schema-to-grammar.mjs", [](const Request &, Response &res)
|
||||||
|
{
|
||||||
|
res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
|
||||||
|
return false; });
|
||||||
|
|
||||||
svr.Post("/completion", [&llama](const Request &req, Response &res)
|
svr.Post("/completion", [&llama](const Request &req, Response &res)
|
||||||
{
|
{
|
||||||
auto lock = llama.lock();
|
auto lock = llama.lock();
|
||||||
|
|
|
@ -126,7 +126,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
|
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
|
||||||
if (error) {
|
if (error) {
|
||||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||||
exit(1);
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
@ -144,7 +144,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
|
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
|
||||||
if (error) {
|
if (error) {
|
||||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||||
exit(1);
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_QKK_64
|
#ifdef GGML_QKK_64
|
||||||
|
@ -156,7 +156,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
#endif
|
#endif
|
||||||
if (error) {
|
if (error) {
|
||||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||||
exit(1);
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -3337,6 +3337,12 @@ struct llama_context * llama_new_context_with_model(
|
||||||
// this allocates all Metal resources and memory buffers
|
// this allocates all Metal resources and memory buffers
|
||||||
ctx->ctx_metal = ggml_metal_init(1);
|
ctx->ctx_metal = ggml_metal_init(1);
|
||||||
|
|
||||||
|
if (!ctx->ctx_metal) {
|
||||||
|
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
||||||
|
llama_free(ctx);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
void * data_ptr = NULL;
|
void * data_ptr = NULL;
|
||||||
size_t data_size = 0;
|
size_t data_size = 0;
|
||||||
|
|
||||||
|
|
2
llama.h
2
llama.h
|
@ -97,7 +97,7 @@ extern "C" {
|
||||||
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
|
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
|
||||||
// if it exists.
|
// if it exists.
|
||||||
// It might not exist for progress report where '.' is output repeatedly.
|
// It might not exist for progress report where '.' is output repeatedly.
|
||||||
typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
|
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
|
||||||
|
|
||||||
struct llama_context_params {
|
struct llama_context_params {
|
||||||
uint32_t seed; // RNG seed, -1 for random
|
uint32_t seed; // RNG seed, -1 for random
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue