llama : remove LLAMA_SUPPORTS_GPU_OFFLOAD
ggml-ci
This commit is contained in:
parent
3180a6468f
commit
8bfb0b6a64
4 changed files with 74 additions and 57 deletions
|
@ -583,20 +583,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.n_gpu_layers = std::stoi(argv[i]);
|
params.n_gpu_layers = std::stoi(argv[i]);
|
||||||
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
if (!llama_supports_gpu_offload()) {
|
||||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||||
#endif
|
}
|
||||||
} else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
|
} else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
||||||
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
if (!llama_supports_gpu_offload()) {
|
||||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
||||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||||
#endif
|
}
|
||||||
} else if (arg == "--main-gpu" || arg == "-mg") {
|
} else if (arg == "--main-gpu" || arg == "-mg") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -989,30 +989,30 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
|
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
|
||||||
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
|
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
|
||||||
if (llama_mlock_supported()) {
|
if (llama_supports_mlock()) {
|
||||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||||
}
|
}
|
||||||
if (llama_mmap_supported()) {
|
if (llama_supports_mmap()) {
|
||||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||||
}
|
}
|
||||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||||
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
||||||
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
||||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
if (llama_supports_gpu_offload()) {
|
||||||
printf(" -ngl N, --n-gpu-layers N\n");
|
printf(" -ngl N, --n-gpu-layers N\n");
|
||||||
printf(" number of layers to store in VRAM\n");
|
printf(" number of layers to store in VRAM\n");
|
||||||
printf(" -ngld N, --n-gpu-layers-draft N\n");
|
printf(" -ngld N, --n-gpu-layers-draft N\n");
|
||||||
printf(" number of layers to store in VRAM for the draft model\n");
|
printf(" number of layers to store in VRAM for the draft model\n");
|
||||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||||
printf(" how to split the model across multiple GPUs, one of:\n");
|
printf(" how to split the model across multiple GPUs, one of:\n");
|
||||||
printf(" - none: use one GPU only\n");
|
printf(" - none: use one GPU only\n");
|
||||||
printf(" - layer (default): split layers and KV across GPUs\n");
|
printf(" - layer (default): split layers and KV across GPUs\n");
|
||||||
printf(" - row: split rows across GPUs\n");
|
printf(" - row: split rows across GPUs\n");
|
||||||
printf(" -ts SPLIT, --tensor-split SPLIT\n");
|
printf(" -ts SPLIT, --tensor-split SPLIT\n");
|
||||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||||
#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
|
}
|
||||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||||
printf(" -gan N, --grp-attn-n N\n");
|
printf(" -gan N, --grp-attn-n N\n");
|
||||||
|
|
|
@ -1789,28 +1789,28 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
|
||||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||||
if (llama_mlock_supported())
|
if (llama_supports_mlock())
|
||||||
{
|
{
|
||||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||||
}
|
}
|
||||||
if (llama_mmap_supported())
|
if (llama_supports_mmap())
|
||||||
{
|
{
|
||||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||||
}
|
}
|
||||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
if (llama_supports_gpu_offload()) {
|
||||||
printf(" -ngl N, --n-gpu-layers N\n");
|
printf(" -ngl N, --n-gpu-layers N\n");
|
||||||
printf(" number of layers to store in VRAM\n");
|
printf(" number of layers to store in VRAM\n");
|
||||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||||
printf(" how to split the model across multiple GPUs, one of:\n");
|
printf(" how to split the model across multiple GPUs, one of:\n");
|
||||||
printf(" - none: use one GPU only\n");
|
printf(" - none: use one GPU only\n");
|
||||||
printf(" - layer (default): split layers and KV across GPUs\n");
|
printf(" - layer (default): split layers and KV across GPUs\n");
|
||||||
printf(" - row: split rows across GPUs\n");
|
printf(" - row: split rows across GPUs\n");
|
||||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||||
printf(" or for intermediate results and KV (with split-mode = row)\n");
|
printf(" or for intermediate results and KV (with split-mode = row)\n");
|
||||||
#endif
|
}
|
||||||
printf(" -m FNAME, --model FNAME\n");
|
printf(" -m FNAME, --model FNAME\n");
|
||||||
printf(" model path (default: %s)\n", params.model.c_str());
|
printf(" model path (default: %s)\n", params.model.c_str());
|
||||||
printf(" -a ALIAS, --alias ALIAS\n");
|
printf(" -a ALIAS, --alias ALIAS\n");
|
||||||
|
@ -2066,13 +2066,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
if (llama_supports_gpu_offload()) {
|
||||||
params.n_gpu_layers = std::stoi(argv[i]);
|
params.n_gpu_layers = std::stoi(argv[i]);
|
||||||
#else
|
} else {
|
||||||
LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
|
LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
|
||||||
"See main README.md for information on enabling GPU BLAS support",
|
"See main README.md for information on enabling GPU BLAS support",
|
||||||
{{"n_gpu_layers", params.n_gpu_layers}});
|
{{"n_gpu_layers", params.n_gpu_layers}});
|
||||||
#endif
|
}
|
||||||
}
|
}
|
||||||
else if (arg == "--split-mode" || arg == "-sm")
|
else if (arg == "--split-mode" || arg == "-sm")
|
||||||
{
|
{
|
||||||
|
|
27
llama.cpp
27
llama.cpp
|
@ -10102,14 +10102,33 @@ size_t llama_max_devices(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llama_mmap_supported(void) {
|
bool llama_supports_mmap(void) {
|
||||||
return llama_mmap::SUPPORTED;
|
return llama_mmap::SUPPORTED;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llama_mlock_supported(void) {
|
bool llama_supports_mlock(void) {
|
||||||
return llama_mlock::SUPPORTED;
|
return llama_mlock::SUPPORTED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool llama_supports_gpu_offload(void) {
|
||||||
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||||
|
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||||
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||||
|
return true;
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// deprecated:
|
||||||
|
bool llama_mmap_supported(void) {
|
||||||
|
return llama_supports_mmap();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llama_mlock_supported(void) {
|
||||||
|
return llama_supports_mlock();
|
||||||
|
}
|
||||||
|
|
||||||
void llama_backend_init(bool numa) {
|
void llama_backend_init(bool numa) {
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
|
@ -10141,8 +10160,8 @@ int64_t llama_time_us(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_model * llama_load_model_from_file(
|
struct llama_model * llama_load_model_from_file(
|
||||||
const char * path_model,
|
const char * path_model,
|
||||||
struct llama_model_params params) {
|
struct llama_model_params params) {
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
llama_model * model = new llama_model;
|
llama_model * model = new llama_model;
|
||||||
|
|
14
llama.h
14
llama.h
|
@ -46,12 +46,6 @@
|
||||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||||
#define LLAMA_SESSION_VERSION 4
|
#define LLAMA_SESSION_VERSION 4
|
||||||
|
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
|
||||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
|
||||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
|
||||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
@ -337,8 +331,12 @@ extern "C" {
|
||||||
|
|
||||||
LLAMA_API size_t llama_max_devices(void);
|
LLAMA_API size_t llama_max_devices(void);
|
||||||
|
|
||||||
LLAMA_API bool llama_mmap_supported (void);
|
LLAMA_API bool llama_supports_mmap (void);
|
||||||
LLAMA_API bool llama_mlock_supported(void);
|
LLAMA_API bool llama_supports_mlock (void);
|
||||||
|
LLAMA_API bool llama_supports_gpu_offload(void);
|
||||||
|
|
||||||
|
LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
|
||||||
|
LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
|
||||||
|
|
||||||
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
|
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue