Merge branch 'master' into server-probs
This commit is contained in:
commit
006e74a493
5 changed files with 23 additions and 4 deletions
|
@ -16,6 +16,7 @@ Command line options:
|
|||
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
|
||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
|
||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
|
||||
- `--numa`: Attempt optimizations that help on some NUMA systems.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "index.html.hpp"
|
||||
#include "index.js.hpp"
|
||||
#include "completion.js.hpp"
|
||||
#include "json-schema-to-grammar.mjs.hpp"
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
|
@ -666,6 +667,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
{
|
||||
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
|
||||
fprintf(stdout, " number of layers to store in VRAM\n");
|
||||
|
@ -940,6 +942,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
{
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--numa")
|
||||
{
|
||||
params.numa = true;
|
||||
}
|
||||
else if (arg == "--embedding")
|
||||
{
|
||||
params.embedding = true;
|
||||
|
@ -1213,6 +1219,12 @@ int main(int argc, char **argv)
|
|||
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
|
||||
return false; });
|
||||
|
||||
// this is only called if no index.html is found in the public --path
|
||||
svr.Get("/json-schema-to-grammar.mjs", [](const Request &, Response &res)
|
||||
{
|
||||
res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
|
||||
return false; });
|
||||
|
||||
svr.Post("/completion", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
auto lock = llama.lock();
|
||||
|
|
|
@ -126,7 +126,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
|
||||
if (error) {
|
||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
exit(1);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
@ -144,7 +144,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
exit(1);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef GGML_QKK_64
|
||||
|
@ -156,7 +156,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
#endif
|
||||
if (error) {
|
||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
exit(1);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -3337,6 +3337,12 @@ struct llama_context * llama_new_context_with_model(
|
|||
// this allocates all Metal resources and memory buffers
|
||||
ctx->ctx_metal = ggml_metal_init(1);
|
||||
|
||||
if (!ctx->ctx_metal) {
|
||||
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
||||
llama_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void * data_ptr = NULL;
|
||||
size_t data_size = 0;
|
||||
|
||||
|
|
2
llama.h
2
llama.h
|
@ -97,7 +97,7 @@ extern "C" {
|
|||
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
|
||||
// if it exists.
|
||||
// It might not exist for progress report where '.' is output repeatedly.
|
||||
typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
|
||||
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
|
||||
|
||||
struct llama_context_params {
|
||||
uint32_t seed; // RNG seed, -1 for random
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue