diff --git a/Makefile b/Makefile index 0de6b579e..fcb4db976 100644 --- a/Makefile +++ b/Makefile @@ -346,49 +346,52 @@ ifdef LLAMA_MPI endif # LLAMA_MPI ifdef LLAMA_OPENSHMEM - - OPENSHMEM_FOUND:=0 - PKG:=sandia-openshmem - REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)') - ifneq ($(REQPKG),) - OPENSHMEM_FOUND:=1 - OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags-only-other sandia-openshmem) - OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs sandia-openshmem) - else - $(warning '$(PKG)' not found) - endif - - ifneq($(OPENSHMEM_FOUND),1) - PKG:=osss-ucx - REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)') - ifneq ($(REQPKG),) - OPENSHMEM_FOUND:=1 - OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags-only-other osss-ucx) - OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs osss-ucx) + ifndef OPENSHMEM_FOUND + OSHMEM_PKG:=sandia-openshmem + OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)') + ifneq ($(OSHMEM_REQPKG),) + OPENSHMEM_FOUND:=1 + OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags sandia-openshmem) + OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs sandia-openshmem) + warn := $(warning OpenSHMEM found) else - $(warning '$(PKG)' not found) + $(warning '$(OSHMEM_PKG)' not found) endif endif - ifneq($(OPENSHMEM_FOUND),1) - PKG:=oshmem - REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)') - ifneq ($(REQPKG),) - OPENSHMEM_FOUND:=1 + ifndef OPENSHMEM_FOUND + OSHMEM_PKG:=osss-ucx + OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)') + ifneq ($(OSHMEM_REQPKG),) + OPENSHMEM_FOUND:=1 + OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags osss-ucx) + OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs osss-ucx) + warn := $(warning OpenSHMEM found) + else + $(warning '$(OSHMEM_PKG)' not found) + endif + endif + + ifndef OPENSHMEM_FOUND + OSHMEM_PKG:=oshmem + OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)') + ifneq ($(OSHMEM_REQPKG),) + OPENSHMEM_FOUND:=1 OPENSHMEM_CFLAGS:=$(shell 
oshmem_info --path incdir) OPENSHMEM_LDFLAGS:=$(shell oshmem_info --path libdir) + warn := $(warning OpenSHMEM found) else - $(warning '$(PKG)' not found) + $(warning '$(OSHMEM_PKG)' not found) endif endif - ifneq($(OPENSHMEM_FOUND),1) - $(error '$(PKG)' not found) + ifndef OPENSHMEM_FOUND + $(error OpenSHMEM not found) endif - MK_CPPFLAGS += -DGGML_USE_OPENSHMEM + MK_CPPFLAGS += -DGGML_USE_OPENSHMEM $(OPENSHMEM_CFLAGS) MK_CFLAGS += -Wno-cast-qual $(OPENSHMEM_CFLAGS) - MK_LDFLAGS += -Wno-cast-qual $(OPENSHMEM_LDFLAGS) + MK_LDFLAGS += -Wno-cast-qual $(OPENSHMEM_LDFLAGS) OBJS += ggml-oshmem.o endif # LLAMA_OPENSHMEM diff --git a/README.md b/README.md index 01aef2afc..a3ad2530f 100644 --- a/README.md +++ b/README.md @@ -335,6 +335,53 @@ Finally, you're ready to run a computation using `mpirun`: mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 ``` +### OpenSHMEM Build + +OpenSHMEM lets you distribute the computation over a cluster of machines using a Partitioned Global Address Space (PGAS). Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine. + +First you will need the OpenSHMEM libraries installed on your system. There are 3 options: [OpenMPI's OpenSHMEM](https://www.open-mpi.org), [OSSS-OpenSHMEM](https://github.com/openshmem-org/osss-ucx) and [Sandia-OpenSHMEM](https://github.com/Sandia-OpenSHMEM/SOS). OSSS-OpenSHMEM has a dependency on the [UCX](https://github.com/openucx/ucx) communication library. Sandia-OpenSHMEM can run over UDP, [UCX](https://github.com/openucx/ucx), or [libfabric](https://github.com/ofiwg/libfabric). OpenMPI's OpenSHMEM can be installed with a package manager (apt, homebrew, etc). UCX, OSSS-OpenSHMEM, and Sandia-OpenSHMEM can all be installed from source. 
+ +Next you will need to build the project with `LLAMA_OPENSHMEM` set to true on all machines; if you're building with `make`, you will also need to specify an OpenSHMEM-capable compiler (when building with CMake, this is configured automatically): + +- Using `make`: + +  ```bash + make CC=oshcc CXX=oshc++ LLAMA_OPENSHMEM=1 + ``` + +- Using `CMake`: + + ```bash + cmake -S . -B build -DLLAMA_OPENSHMEM=ON -DCMAKE_C_COMPILER=oshcc -DCMAKE_CXX_COMPILER=oshc++ + ``` + +If you have access to a distributed file system (NFS) it's suggested you copy the programs and weights onto the distributed file system. + +Additionally, if you have a cluster with a bulk-synchronous scheduler, e.g. [Slurm](https://slurm.schedmd.com), all you need to do is run the program from the distributed file system using the bulk-synchronous scheduler. The following example assumes a Slurm cluster. The example additionally assumes an NFS installation with the distributed file system mounted with the following path on all machines: `/nfs_path`. + +``` +srun -n 2 /nfs_path/main -m /nfs_path/models/7B/ggml-model-q4_0.gguf -n 128 +``` + +If you do not have access to a cluster with a bulk-synchronous scheduler or a distributed file system, the following instructions will help you stage an installation and run the application. Build the programs, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines. + +Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost". + +Here is an example hostfile: + +``` +192.168.0.1:1 +malvolio.local:1 +``` + +The above will distribute the computation across 1 process on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. 
Try to keep these numbers small, as inter-process (intra-host) communication is expensive. It is a requirement of OpenSHMEM that the distributed job be performed over a number of machines that is a power of 2. + +Finally, you're ready to run a computation using `oshrun`: + +```bash +oshrun -hostfile hostfile -n 2 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 +``` + ### BLAS Build Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use: