diff --git a/Makefile b/Makefile index 0de6b579e..fcb4db976 100644 --- a/Makefile +++ b/Makefile @@ -346,49 +346,52 @@ ifdef LLAMA_MPI endif # LLAMA_MPI ifdef LLAMA_OPENSHMEM - - OPENSHMEM_FOUND:=0 - PKG:=sandia-openshmem - REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)') - ifneq ($(REQPKG),) - OPENSHMEM_FOUND:=1 - OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags-only-other sandia-openshmem) - OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs sandia-openshmem) - else - $(warning '$(PKG)' not found) - endif - - ifneq($(OPENSHMEM_FOUND),1) - PKG:=osss-ucx - REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)') - ifneq ($(REQPKG),) - OPENSHMEM_FOUND:=1 - OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags-only-other osss-ucx) - OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs osss-ucx) + ifndef OPENSHMEM_FOUND + OSHMEM_PKG:=sandia-openshmem + OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)') + ifneq ($(OSHMEM_REQPKG),) + OPENSHMEM_FOUND:=1 + OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags sandia-openshmem) + OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs sandia-openshmem) + warn := $(warning OpenSHMEM found) else - $(warning '$(PKG)' not found) + $(warning '$(OSHMEM_PKG)' not found) endif endif - ifneq($(OPENSHMEM_FOUND),1) - PKG:=oshmem - REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)') - ifneq ($(REQPKG),) - OPENSHMEM_FOUND:=1 + ifndef OPENSHMEM_FOUND + OSHMEM_PKG:=osss-ucx + OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)') + ifneq ($(OSHMEM_REQPKG),) + OPENSHMEM_FOUND:=1 + OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags osss-ucx) + OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs osss-ucx) + warn := $(warning OpenSHMEM found) + else + $(warning '$(OSHMEM_PKG)' not found) + endif + endif + + ifndef OPENSHMEM_FOUND + OSHMEM_PKG:=oshmem + OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)') + ifneq ($(OSHMEM_REQPKG),) + OPENSHMEM_FOUND:=1 OPENSHMEM_CFLAGS:=$(shell 
oshmem_info --path incdir) OPENSHMEM_LDFLAGS:=$(shell oshmem_info --path libdir) + warn := $(warning OpenSHMEM found) else - $(warning '$(PKG)' not found) + $(warning '$(OSHMEM_PKG)' not found) endif endif - ifneq($(OPENSHMEM_FOUND),1) - $(error '$(PKG)' not found) + ifndef OPENSHMEM_FOUND + $(error OpenSHMEM not found) endif - MK_CPPFLAGS += -DGGML_USE_OPENSHMEM + MK_CPPFLAGS += -DGGML_USE_OPENSHMEM $(OPENSHMEM_CFLAGS) MK_CFLAGS += -Wno-cast-qual $(OPENSHMEM_CFLAGS) - MK_LDFLAGS += -Wno-cast-qual $(OPENSHMEM_LDFLAGS) + MK_LDFLAGS += -Wno-cast-qual $(OPENSHMEM_LDFLAGS) OBJS += ggml-oshmem.o endif # LLAMA_OPENSHMEM diff --git a/README.md b/README.md index 01aef2afc..a3ad2530f 100644 --- a/README.md +++ b/README.md @@ -335,6 +335,53 @@ Finally, you're ready to run a computation using `mpirun`: mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 ``` +### OpenSHMEM Build + +OpenSHMEM lets you distribute the computation over a cluster of machines using a Partitioned Global Address Space (PGAS). Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine. + +First you will need the OpenSHMEM libraries installed on your system. There are 3 options: [OpenMPI's OpenSHMEM](https://www.open-mpi.org), [OSSS-OpenSHMEM](https://github.com/openshmem-org/osss-ucx) and [Sandia-OpenSHMEM](https://github.com/Sandia-OpenSHMEM/SOS). OSSS-OpenSHMEM has a dependency on the [UCX](https://github.com/openucx/ucx) communication library. Sandia-OpenSHMEM can run over UDP, [UCX](https://github.com/openucx/ucx), or [libfabric](https://github.com/ofiwg/libfabric). OpenMPI's OpenSHMEM can be installed with a package manager (apt, homebrew, etc). UCX, OSSS-OpenSHMEM, and Sandia-OpenSHMEM can all be installed from source. 
+ +Next you will need to build the project with `LLAMA_OPENSHMEM` set to true on all machines; if you're building with `make`, you will also need to specify an OpenSHMEM-capable compiler (when building with CMake, this is configured automatically): + +- Using `make`: + +  ```bash + make CC=oshcc CXX=oshc++ LLAMA_OPENSHMEM=1 + ``` + +- Using `CMake`: + + ```bash + cmake -S . -B build -DLLAMA_OPENSHMEM=ON -DCMAKE_C_COMPILER=oshcc -DCMAKE_CXX_COMPILER=oshc++ + ``` + +If you have access to a distributed file system (NFS) it's suggested you copy the programs and weights onto the distributed file system. + +Additionally, if you have a cluster with a bulk-synchronous scheduler, e.g. [Slurm](https://slurm.schedmd.com), all you need to do is run the program from the distributed file system using the bulk-synchronous scheduler. The following example assumes a Slurm cluster. The example additionally assumes an NFS installation with the distributed file system mounted with the following path on all machines: `/nfs_path`. + +``` +srun -n 2 /nfs_path/main -m /nfs_path/models/7B/ggml-model-q4_0.gguf -n 128 +``` + +If you do not have access to a cluster with a bulk-synchronous scheduler or a distributed file system, the following instructions will help you stage an installation and run the application. Build the programs, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines. + +Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost". + +Here is an example hostfile: + +``` +192.168.0.1:1 +malvolio.local:1 +``` + +The above will distribute the computation across 1 process on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. 
Try to keep these numbers small, as inter-process (intra-host) communication is expensive. It is a requirement of OpenSHMEM that the distributed job be performed over a number of machines that is a power of 2. + +Finally, you're ready to run a computation using `oshrun`: + +```bash +oshrun -hostfile hostfile -n 2 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 +``` + ### BLAS Build Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use: