ggml: Added OpenMP for multi-threaded processing

Author: msy-kato
Date: 2024-05-29 13:33:37 +09:00
parent 504f0c340f
commit e9a70b10c2
3 changed files with 73 additions and 36 deletions
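
Graph computation in ggml can now use OpenMP as its threading backend. A new LLAMA_OPENMP option (off by default) is added to both the CMake and Makefile builds; when enabled it defines GGML_USE_OPENMP and adds the OpenMP compiler flags. In ggml.c, the thread-pool setup in ggml_graph_compute is factored out into a ggml_graph_compute_parallel() helper that runs the per-thread work inside an OpenMP parallel region when GGML_USE_OPENMP is defined, and falls back to the existing create/join thread pool otherwise.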

CMakeLists.txt

@@ -125,6 +125,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_RPC "llama: use RPC" OFF)
+option(LLAMA_OPENMP "llama: use OpenMP" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
 set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
@@ -295,6 +296,17 @@ if (LLAMA_METAL)
         )
 endif()
+
+if (LLAMA_OPENMP)
+    find_package(OpenMP)
+    if(OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+        add_compile_definitions(GGML_USE_OPENMP)
+        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    endif()
+endif()
+
 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
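
With the option wired up, an OpenMP-enabled build would be configured roughly as follows (illustrative invocation; the build directory and generator are whatever you normally use):

    cmake -B build -DLLAMA_OPENMP=ON
    cmake --build build

Note that find_package(OpenMP) is not marked REQUIRED: if no OpenMP installation is found, the build proceeds without GGML_USE_OPENMP instead of failing.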

Makefile

@@ -400,6 +400,12 @@ ifndef LLAMA_NO_ACCELERATE
     endif
 endif # LLAMA_NO_ACCELERATE
 
+ifdef LLAMA_OPENMP
+    MK_CPPFLAGS += -DGGML_USE_OPENMP
+    MK_CFLAGS   += -fopenmp
+    MK_CXXFLAGS += -fopenmp
+endif # LLAMA_OPENMP
+
 ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
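
The Makefile build is opt-in in the same way; since the guard is a plain ifdef, any non-empty value enables it, e.g. (assuming a toolchain that accepts -fopenmp, such as gcc):

    make LLAMA_OPENMP=1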

ggml.c

@@ -28,6 +28,10 @@
 #include <syscall.h>
 #endif
 
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
 #ifdef GGML_USE_METAL
 #include <unistd.h>
 #endif
@@ -19661,6 +19665,48 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     return cplan;
 }
 
+static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
+    enum ggml_status compute_status = GGML_STATUS_SUCCESS;
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel num_threads(n_threads)
+    {
+        ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
+    }
+#else
+    // create thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; ++j) {
+            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+
+    // this is a work thread too
+    ggml_graph_compute_thread(&workers[0]);
+
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+
+    // join or kill thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; j++) {
+            const int rc = ggml_thread_join(workers[j].thrd, NULL);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+#endif
+
+    for (int j = 0; j < n_threads; j++) {
+        if (workers[j].ec != GGML_STATUS_SUCCESS) {
+            compute_status = workers[j].ec;
+            break;
+        }
+    }
+    return compute_status;
+}
+
 enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
         GGML_ASSERT(cplan);
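
For readers less familiar with OpenMP, the dispatch pattern above reduces to the standalone sketch below (illustrative only; worker_state and do_work are placeholders for ggml_compute_state and ggml_graph_compute_thread). Each member of the parallel team picks its worker slot via omp_get_thread_num(), and the implicit barrier at the end of the parallel region takes the place of the explicit ggml_thread_join loop.

    #include <omp.h>
    #include <stdio.h>

    struct worker_state { int ith; };  // placeholder for ggml_compute_state

    // placeholder for ggml_graph_compute_thread
    static void do_work(struct worker_state * w) {
        printf("worker %d running\n", w->ith);
    }

    int main(void) {
        enum { N_THREADS = 4 };
        struct worker_state workers[N_THREADS];
        for (int j = 0; j < N_THREADS; ++j) {
            workers[j].ith = j;
        }

        // one parallel region replaces pthread create/join; the region ends
        // with an implicit barrier, so all workers are done when it returns
        #pragma omp parallel num_threads(N_THREADS)
        {
            do_work(&workers[omp_get_thread_num()]);
        }
        return 0;
    }

Compile with something like cc -fopenmp sketch.c.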
@@ -19687,47 +19733,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.current_chunk; =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .ith    = j,
-                .shared = &state_shared,
-                .ec     = GGML_STATUS_SUCCESS,
-            };
-
-            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
-
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
     const int64_t perf_start_cycles  = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
 
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
-
-    enum ggml_status compute_status = workers[0].ec;
-
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
-    // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
-    }
+    for (int j = 0; j < n_threads; ++j) {
+        workers[j] = (struct ggml_compute_state) {
+            .thrd   = 0,
+            .ith    = j,
+            .shared = &state_shared,
+            .ec     = GGML_STATUS_SUCCESS,
+        };
+    }
+
+    enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
 
     // performance stats (graph)
     {
         int64_t perf_cycles_cur  = ggml_perf_cycles() - perf_start_cycles;
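
With this change, ggml_graph_compute initializes the worker state for every thread index (including 0) before dispatching, so both the OpenMP and the pthread path start from fully initialized ggml_compute_state entries, and the final status comes from the aggregation loop in ggml_graph_compute_parallel() rather than being collected inline during the join.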