ggml : add support for dynamic loading of backends (#10469)
* ggml : add support for dynamic loading of backends --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
f6d12e7df8
commit
5931c1f233
44 changed files with 728 additions and 272 deletions
|
@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
|||
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
add_subdirectory(export-lora)
|
||||
add_subdirectory(gbnf-validator)
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf-split)
|
||||
|
@ -27,24 +24,16 @@ else()
|
|||
add_subdirectory(imatrix)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(llava)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(passkey)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(retrieval)
|
||||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
if (GGML_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
|
@ -52,4 +41,18 @@ else()
|
|||
add_subdirectory(speculative)
|
||||
add_subdirectory(speculative-simple)
|
||||
add_subdirectory(tokenize)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these examples use the backends directly and cannot be built with dynamic loading
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(export-lora)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(llava)
|
||||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
if (GGML_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
|
|
@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
|||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
set(TEST_TARGET test-eval-callback)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||
add_test(NAME ${TEST_TARGET}
|
||||
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
||||
|
|
|
@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {
|
|||
|
||||
cmd_params params = parse_cmd_params(argc, argv);
|
||||
|
||||
// initialize backends
|
||||
ggml_backend_load_all();
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
||||
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
|
||||
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
|
||||
|
||||
// initialize llama.cpp
|
||||
if (!params.verbose) {
|
||||
llama_log_set(llama_null_log_callback, NULL);
|
||||
|
@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
|
|||
tpp.poll = t.poll;
|
||||
tpp.prio = params.prio;
|
||||
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
|
||||
if (!threadpool) {
|
||||
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
exit(1);
|
||||
|
@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_free(ctx);
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
ggml_threadpool_free_fn(threadpool);
|
||||
}
|
||||
|
||||
llama_free_model(lmodel);
|
||||
|
|
|
@ -165,6 +165,10 @@ int main(int argc, char ** argv) {
|
|||
|
||||
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
||||
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
||||
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
|
||||
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
|
||||
|
||||
struct ggml_threadpool_params tpp_batch =
|
||||
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
||||
struct ggml_threadpool_params tpp =
|
||||
|
@ -174,7 +178,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
struct ggml_threadpool * threadpool_batch = NULL;
|
||||
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
||||
threadpool_batch = ggml_threadpool_new(&tpp_batch);
|
||||
threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
|
||||
if (!threadpool_batch) {
|
||||
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
||||
return 1;
|
||||
|
@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
|
|||
tpp.paused = true;
|
||||
}
|
||||
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
|
||||
if (!threadpool) {
|
||||
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
return 1;
|
||||
|
@ -890,8 +894,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_backend_free();
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
ggml_threadpool_free(threadpool_batch);
|
||||
ggml_threadpool_free_fn(threadpool);
|
||||
ggml_threadpool_free_fn(threadpool_batch);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}, nullptr);
|
||||
|
||||
// load dynamic backends
|
||||
ggml_backend_load_all();
|
||||
|
||||
// initialize the model
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.n_gpu_layers = ngl;
|
||||
|
|
|
@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
// load dynamic backends
|
||||
|
||||
ggml_backend_load_all();
|
||||
|
||||
// initialize the model
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue