OpenMP: remove repetitive thread creation using OpenMP
- Add OpenMP to dependency detection - Define GGML_NO_OMP when OpenMP is disabled or not found - Keep the previous pthread-based thread creation when OpenMP is unavailable
This commit is contained in:
parent
1ea2a0036e
commit
47edd35d78
3 changed files with 72 additions and 27 deletions
|
@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
|
|||
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
|
||||
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_DISABLE_OMP "Disable OpenMP support" OFF)
|
||||
option(LLAMA_RPC "llama: use RPC" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
|
@ -1242,6 +1243,25 @@ if (BUILD_SHARED_LIBS)
|
|||
install(TARGETS ggml_shared LIBRARY)
|
||||
endif()
|
||||
|
||||
|
||||
if (NOT LLAMA_DISABLE_OMP)
    # Prefer OpenMP for ggml's compute thread pool; when it is disabled or
    # not found, define GGML_NO_OMP so ggml.c falls back to its own
    # pthread-based thread creation.
    find_package(OpenMP)
    # ggml is compiled as C, so the C component must be available too.
    # Checking only OpenMP_CXX_FOUND could append an undefined
    # ${OpenMP_C_FLAGS} to CMAKE_C_FLAGS and build ggml.c without OpenMP.
    if (OpenMP_C_FOUND AND OpenMP_CXX_FOUND)
        message(STATUS "OpenMP found")
        set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
        # Link both imported targets: OpenMP::OpenMP_C carries the runtime
        # library usage requirements for the C sources of ggml.
        target_link_libraries(ggml PUBLIC OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
    else()
        message(STATUS "OpenMP NOT found activating standard threading")
        add_compile_definitions(GGML_NO_OMP)
    endif()
else()
    message(STATUS "OpenMP disabled activating standard threading")
    add_compile_definitions(GGML_NO_OMP)
endif()
|
||||
|
||||
|
||||
# llama
|
||||
|
||||
add_library(llama
|
||||
|
|
8
Makefile
8
Makefile
|
@ -406,6 +406,14 @@ ifdef LLAMA_MPI
|
|||
OBJS += ggml-mpi.o
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifndef LLAMA_NO_OMP
	# Build with OpenMP: ggml.c creates its thread pool via
	# '#pragma omp parallel for'. The flag is needed at compile time for
	# every language in the build and at link time for the OpenMP runtime.
	# MK_CXXFLAGS is included so C++ translation units are compiled
	# consistently with the C objects they link against.
	MK_CPPFLAGS += -fopenmp
	MK_CFLAGS   += -fopenmp
	MK_CXXFLAGS += -fopenmp
	MK_LDFLAGS  += -fopenmp
else
	# No OpenMP: tell ggml.c to fall back to pthread-based thread creation.
	MK_CPPFLAGS += -DGGML_NO_OMP
endif
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
|
||||
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
|
||||
|
|
71
ggml.c
71
ggml.c
|
@ -19334,8 +19334,12 @@ typedef int ggml_lock_t;
|
|||
|
||||
#endif
|
||||
|
||||
#ifdef GGML_NO_OMP
|
||||
|
||||
|
||||
// Android's libc implementation "bionic" does not support setting affinity
|
||||
#if defined(__gnu_linux__)
|
||||
|
||||
static void set_numa_thread_affinity(int thread_n) {
|
||||
if (!ggml_is_numa()) {
|
||||
return;
|
||||
|
@ -19401,11 +19405,16 @@ static void clear_numa_thread_affinity(void) {
|
|||
|
||||
CPU_FREE(cpus);
|
||||
}
|
||||
|
||||
#else
|
||||
// TODO: Windows etc.
|
||||
// (the linux implementation may also work on BSD, someone should test)
|
||||
static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
|
||||
static void clear_numa_thread_affinity(void) {}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
||||
|
@ -19713,7 +19722,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
|
||||
const int n_threads = state->shared->n_threads;
|
||||
|
||||
#ifdef GGML_NO_OMP
|
||||
set_numa_thread_affinity(state->ith);
|
||||
#endif
|
||||
|
||||
int node_n = -1;
|
||||
int task_phase = GGML_TASK_TYPE_FINALIZE;
|
||||
|
@ -20086,44 +20097,50 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|||
};
|
||||
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
||||
|
||||
// create thread pool
|
||||
if (n_threads > 1) {
|
||||
for (int j = 1; j < n_threads; ++j) {
|
||||
workers[j] = (struct ggml_compute_state) {
|
||||
.thrd = 0,
|
||||
.ith = j,
|
||||
.shared = &state_shared,
|
||||
.ec = GGML_STATUS_SUCCESS,
|
||||
};
|
||||
const int64_t perf_start_cycles = ggml_perf_cycles();
|
||||
const int64_t perf_start_time_us = ggml_perf_time_us();
|
||||
|
||||
/* Loop is reversed as in the NO_OMP case we want threads to start
|
||||
before the main thread (j==0) */
|
||||
#pragma omp parallel for shared(workers,state_shared)
|
||||
for (int j = n_threads - 1; 0 <= j; j--) {
|
||||
workers[j] = (struct ggml_compute_state) {
|
||||
.ith = j,
|
||||
.shared = &state_shared,
|
||||
.ec = GGML_STATUS_SUCCESS,
|
||||
};
|
||||
|
||||
#ifdef GGML_NO_OMP
|
||||
if(j == 0)
|
||||
{
|
||||
/* No need to spawn a thread for main */
|
||||
ggml_graph_compute_thread(&workers[j]);
|
||||
}
|
||||
else
|
||||
{
|
||||
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
||||
GGML_ASSERT(rc == 0);
|
||||
UNUSED(rc);
|
||||
}
|
||||
#else
|
||||
ggml_graph_compute_thread(&workers[j]);
|
||||
#endif
|
||||
}
|
||||
|
||||
workers[0].ith = 0;
|
||||
workers[0].shared = &state_shared;
|
||||
workers[0].ec = GGML_STATUS_SUCCESS;
|
||||
#ifdef GGML_NO_OMP
|
||||
clear_numa_thread_affinity();
|
||||
#endif
|
||||
|
||||
const int64_t perf_start_cycles = ggml_perf_cycles();
|
||||
const int64_t perf_start_time_us = ggml_perf_time_us();
|
||||
|
||||
// this is a work thread too
|
||||
ggml_graph_compute_thread(&workers[0]);
|
||||
enum ggml_status compute_status = workers[0].ec;
|
||||
|
||||
// don't leave affinity set on the main thread
|
||||
clear_numa_thread_affinity();
|
||||
|
||||
// join or kill thread pool
|
||||
if (n_threads > 1) {
|
||||
for (int j = 1; j < n_threads; j++) {
|
||||
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
||||
GGML_ASSERT(rc == 0);
|
||||
if (workers[j].ec != GGML_STATUS_SUCCESS)
|
||||
compute_status = workers[j].ec;
|
||||
}
|
||||
for (int j = 1; j < n_threads; j++) {
|
||||
#ifdef GGML_NO_OMP
|
||||
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
||||
GGML_ASSERT(rc == 0);
|
||||
#endif
|
||||
if (workers[j].ec != GGML_STATUS_SUCCESS)
|
||||
compute_status = workers[j].ec;
|
||||
}
|
||||
|
||||
// performance stats (graph)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue