updated README.md and Makefile
This commit is contained in:
parent
9604114da0
commit
79be614ea5
2 changed files with 80 additions and 30 deletions
63
Makefile
63
Makefile
|
@ -346,49 +346,52 @@ ifdef LLAMA_MPI
|
||||||
endif # LLAMA_MPI
|
endif # LLAMA_MPI
|
||||||
|
|
||||||
ifdef LLAMA_OPENSHMEM
|
ifdef LLAMA_OPENSHMEM
|
||||||
|
ifndef OPENSHMEM_FOUND
|
||||||
OPENSHMEM_FOUND:=0
|
OSHMEM_PKG:=sandia-openshmem
|
||||||
PKG:=sandia-openshmem
|
OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)')
|
||||||
REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)')
|
ifneq ($(OSHMEM_REQPKG),)
|
||||||
ifneq ($(REQPKG),)
|
OPENSHMEM_FOUND:=1
|
||||||
OPENSHMEM_FOUND:=1
|
OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags sandia-openshmem)
|
||||||
OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags-only-other sandia-openshmem)
|
OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs sandia-openshmem)
|
||||||
OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs sandia-openshmem)
|
warn := $(warning OpenSHMEM found)
|
||||||
else
|
|
||||||
$(warning '$(PKG)' not found)
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifneq($(OPENSHMEM_FOUND),1)
|
|
||||||
PKG:=osss-ucx
|
|
||||||
REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)')
|
|
||||||
ifneq ($(REQPKG),)
|
|
||||||
OPENSHMEM_FOUND:=1
|
|
||||||
OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags-only-other osss-ucx)
|
|
||||||
OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs osss-ucx)
|
|
||||||
else
|
else
|
||||||
$(warning '$(PKG)' not found)
|
$(warning '$(OSHMEM_PKG)' not found)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifneq($(OPENSHMEM_FOUND),1)
|
ifndef OPENSHMEM_FOUND
|
||||||
PKG:=oshmem
|
OSHMEM_PKG:=osss-ucx
|
||||||
REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)')
|
OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)')
|
||||||
ifneq ($(REQPKG),)
|
ifneq ($(OSHMEM_REQPKG),)
|
||||||
OPENSHMEM_FOUND:=1
|
OPENSHMEM_FOUND:=1
|
||||||
|
OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags osss-ucx)
|
||||||
|
OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs osss-ucx)
|
||||||
|
warn := $(warning OpenSHMEM found)
|
||||||
|
else
|
||||||
|
$(warning '$(OSHMEM_PKG)' not found)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef OPENSHMEM_FOUND
|
||||||
|
OSHMEM_PKG:=oshmem
|
||||||
|
OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)')
|
||||||
|
ifneq ($(OSHMEM_REQPKG),)
|
||||||
|
OPENSHMEM_FOUND:=1
|
||||||
OPENSHMEM_CFLAGS:=-I$(shell oshmem_info --path incdir)
|
OPENSHMEM_CFLAGS:=-I$(shell oshmem_info --path incdir)
|
||||||
OPENSHMEM_LDFLAGS:=-L$(shell oshmem_info --path libdir)
|
OPENSHMEM_LDFLAGS:=-L$(shell oshmem_info --path libdir)
|
||||||
|
warn := $(warning OpenSHMEM found)
|
||||||
else
|
else
|
||||||
$(warning '$(PKG)' not found)
|
$(warning '$(OSHMEM_PKG)' not found)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifneq($(OPENSHMEM_FOUND),1)
|
ifndef OPENSHMEM_FOUND
|
||||||
$(error '$(PKG)' not found)
|
$(error OpenSHMEM not found)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
MK_CPPFLAGS += -DGGML_USE_OPENSHMEM
|
MK_CPPFLAGS += -DGGML_USE_OPENSHMEM $(OPENSHMEM_CFLAGS)
|
||||||
MK_CFLAGS += -Wno-cast-qual $(OPENSHMEM_CFLAGS)
|
MK_CFLAGS += -Wno-cast-qual $(OPENSHMEM_CFLAGS)
|
||||||
MK_LDFLAGS += -Wno-cast-qual $(OPENSHMEM_LDFLAGS)
|
MK_LDFLAGS += -Wno-cast-qual $(OPENSHMEM_LDFLAGS)
|
||||||
OBJS += ggml-oshmem.o
|
OBJS += ggml-oshmem.o
|
||||||
endif # LLAMA_OPENSHMEM
|
endif # LLAMA_OPENSHMEM
|
||||||
|
|
||||||
|
|
47
README.md
47
README.md
|
@ -335,6 +335,53 @@ Finally, you're ready to run a computation using `mpirun`:
|
||||||
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
|
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### OpenSHMEM Build
|
||||||
|
|
||||||
|
OpenSHMEM lets you distribute the computation over a cluster of machines using a Partitioned Global Address Space (PGAS). Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
|
||||||
|
|
||||||
|
First you will need the OpenSHMEM libraries installed on your system. There are 3 options: [OpenMPI's OpenSHMEM](https://www.open-mpi.org), [OSSS-OpenSHMEM](https://github.com/openshmem-org/osss-ucx) and [Sandia-OpenSHMEM](https://github.com/Sandia-OpenSHMEM/SOS). OSSS-OpenSHMEM has a dependency on the [UCX](https://github.com/openucx/ucx) communication library. Sandia-OpenSHMEM can run over UDP, [UCX](https://github.com/openucx/ucx), or [libfabric](https://github.com/ofiwg/libfabric). OpenMPI's OpenSHMEM can be installed with a package manager (apt, homebrew, etc). UCX, OSSS-OpenSHMEM, and Sandia-OpenSHMEM can all be installed from source.
|
||||||
|
|
||||||
|
Next you will need to build the project with `LLAMA_OPENSHMEM` set to true on all machines; if you're building with `make`, you will also need to specify an OpenSHMEM-capable compiler (when building with CMake, this is configured automatically):
|
||||||
|
|
||||||
|
- Using `make`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make CC=oshcc CXX=oshc++ LLAMA_OPENSHMEM=1
|
||||||
|
```
|
||||||
|
|
||||||
|
- Using `CMake`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cmake -S . -B build -DLLAMA_OPENSHMEM=ON -DCMAKE_C_COMPILER=oshcc -DCMAKE_CXX_COMPILER=oshc++
|
||||||
|
```
|
||||||
|
|
||||||
|
If you have access to a distributed file system (NFS) it's suggested you copy the programs and weights onto the distributed file system.
|
||||||
|
|
||||||
|
Additionally, if you have a cluster with a bulk-synchronous scheduler, e.g. [Slurm](https://slurm.schedmd.com), all you need to do is run the program from the distributed file system using the bulk-synchronous scheduler. The following example assumes a Slurm cluster. The example additionally assumes an NFS installation with the distributed file system mounted with the following path on all machines: `/nfs_path`.
|
||||||
|
|
||||||
|
```
|
||||||
|
srun -n 2 /nfs_path/main -m /nfs_path/models/7B/ggml-model-q4_0.gguf -n 128
|
||||||
|
```
|
||||||
|
|
||||||
|
If you do not have access to a cluster with a bulk-synchronous scheduler or a distributed file system, the following instructions will help you stage an installation and run the application. Build the programs, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
|
||||||
|
|
||||||
|
Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
|
||||||
|
|
||||||
|
Here is an example hostfile:
|
||||||
|
|
||||||
|
```
|
||||||
|
192.168.0.1:1
|
||||||
|
malvolio.local:1
|
||||||
|
```
|
||||||
|
|
||||||
|
The above will distribute the computation across 1 process on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive. It is a requirement of OpenSHMEM that the distributed job be performed over a number of machines that is equal to a power of 2.
|
||||||
|
|
||||||
|
Finally, you're ready to run a computation using `oshrun`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
oshrun -hostfile hostfile -n 2 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
|
||||||
|
```
|
||||||
|
|
||||||
### BLAS Build
|
### BLAS Build
|
||||||
|
|
||||||
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
|
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue