From fcfe07f829f60e988c464e1db8e8163501dfd985 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Wed, 20 Dec 2023 22:44:08 -0500
Subject: [PATCH 01/16] initial import

---
 CMakeLists.txt               |  25 +
 cmake/FindOpenSHMEM.cmake    | 917 +++++++++++++++++++++++++++++++++++
 cmake/FindOpenShmemPmi.cmake |  65 +++
 ggml-oshmem.c                | 346 +++++++++++++
 ggml-oshmem.h                |  43 ++
 llama.cpp                    |   3 +
 6 files changed, 1399 insertions(+)
 create mode 100644 cmake/FindOpenSHMEM.cmake
 create mode 100644 cmake/FindOpenShmemPmi.cmake
 create mode 100644 ggml-oshmem.c
 create mode 100644 ggml-oshmem.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3cd43ab3..e58041af7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,6 +95,7 @@ option(LLAMA_CLBLAST                         "llama: use CLBlast"
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
+option(LLAMA_OPENSHMEM                       "llama: use OpenSHMEM"                             OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 
 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
@@ -344,6 +345,29 @@ if (LLAMA_MPI)
     endif()
 endif()
 
+if (LLAMA_OPENSHMEM)
+  # Check the version explicitly: cmake_minimum_required() in the middle of a file would reset the active policies.
+  if (CMAKE_VERSION VERSION_LESS 3.21) # cmake/FindOpenSHMEM.cmake uses find_library(... NO_CACHE), added in 3.21
+    message(FATAL_ERROR "LLAMA_OPENSHMEM requires CMake 3.21 or newer")
+  endif()
+  include(cmake/FindOpenSHMEM.cmake)
+  setup_openshmem() # macro: leaves OPENSHMEM_FOUND, OPENSHMEM_CFLAGS and OPENSHMEM_LDFLAGS in this scope
+  if (OPENSHMEM_FOUND)
+    message(STATUS "OpenSHMEM found")
+    set(GGML_HEADERS_OPENSHMEM ggml-oshmem.h)
+    set(GGML_SOURCES_OPENSHMEM ggml-oshmem.c ggml-oshmem.h)
+    add_compile_definitions(GGML_USE_OPENSHMEM)
+    if (NOT MSVC)
+      add_compile_options(-Wno-cast-qual)
+    endif()
+    set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${OPENSHMEM_LDFLAGS})
+    string(REPLACE "-I" "" OPENSHMEM_CFLAGS "${OPENSHMEM_CFLAGS}") # quoted: keeps the ';' list intact and tolerates an empty value
+    set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${OPENSHMEM_CFLAGS})
+  else()
+    message(WARNING "OpenSHMEM not found")
+  endif()
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -722,6 +746,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_OPENSHMEM}    ${GGML_HEADERS_OPENSHMEM}
             ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
             )
 
diff --git a/cmake/FindOpenSHMEM.cmake b/cmake/FindOpenSHMEM.cmake
new file mode 100644
index 000000000..550d7639f
--- /dev/null
+++ b/cmake/FindOpenSHMEM.cmake
@@ -0,0 +1,917 @@
+# Copyright (c) 2019-2023 Ste||ar Group
+#
+# SPDX-License-Identifier: BSL-1.0
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+macro(setup_openshmem)
+ 
+  if(NOT TARGET PkgConfig::OPENSHMEM)
+
+    set(OPENSHMEM_PC "")
+    find_package(PkgConfig QUIET) # defines pkg_search_module(), which both branches below call
+    find_package(MPI)
+    if (LLAMA_MPI AND MPI_C_FOUND)
+      set(ENV{PKG_CONFIG_PATH} "$ENV{PKG_CONFIG_PATH}:${MPI_LIBDIR}/pkgconfig")
+
+      set(OPENSHMEM_PC "oshmem")
+      pkg_search_module(OPENSHMEM IMPORTED_TARGET GLOBAL ${OPENSHMEM_PC})
+
+      if(NOT OPENSHMEM_FOUND)
+        find_program(OSHMEM_INFO NAMES oshmem_info ompi_info REQUIRED)
+
+        if(NOT OSHMEM_INFO)
+          message(
+            FATAL_ERROR
+              "oshmem_info and/or ompi_info not found! pkg-config cannot find OpenMPI's `${OPENSHMEM_PC}.pc`"
+          )
+        endif()
+
+        # Capture the tool output in the build tree so the source tree is never written to.
+        set(OSHMEM_INFO_OUTPUT
+            "${CMAKE_CURRENT_BINARY_DIR}/oshmem_info_stdout.log"
+        )
+        set(OSHMEM_INFO_ERROR
+            "${CMAKE_CURRENT_BINARY_DIR}/oshmem_info_error.log"
+        )
+        execute_process(
+          COMMAND "${OSHMEM_INFO}" --path libdir
+          WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+          RESULT_VARIABLE OSHMEM_INFO_STATUS
+          OUTPUT_FILE "${OSHMEM_INFO_OUTPUT}"
+          ERROR_FILE "${OSHMEM_INFO_ERROR}"
+        )
+
+        if(OSHMEM_INFO_STATUS)
+          message(
+            FATAL_ERROR
+              "${OSHMEM_INFO} Failed! Program status code: ${OSHMEM_INFO_STATUS}"
+          )
+        endif()
+
+        file(READ ${OSHMEM_INFO_OUTPUT} OSHMEM_INFO_OUTPUT_CONTENT)
+
+        if(NOT DEFINED OSHMEM_INFO_OUTPUT_CONTENT)
+          message(
+            FATAL_ERROR
+              "${OSHMEM_INFO} Failed! Check: ${OSHMEM_INFO_ERROR}\n${OSHMEM_INFO_OUTPUT_CONTENT}"
+          )
+        endif()
+
+        if("${OSHMEM_INFO_OUTPUT_CONTENT}" STREQUAL "")
+          message(
+            FATAL_ERROR
+              "${OSHMEM_INFO} Failed! Check: ${OSHMEM_INFO_ERROR}\n${OSHMEM_INFO_OUTPUT_CONTENT}"
+          )
+        endif()
+
+        string(REGEX MATCH "(\/.*)" OSHMEM_LIBDIR_PATH
+                     ${OSHMEM_INFO_OUTPUT_CONTENT}
+        )
+
+        string(STRIP ${OSHMEM_LIBDIR_PATH} OSHMEM_LIBDIR_PATH)
+
+        set(ENV{PKG_CONFIG_PATH}
+            "$ENV{PKG_CONFIG_PATH}:${OSHMEM_LIBDIR_PATH}/pkgconfig"
+        )
+
+        pkg_search_module(OPENSHMEM IMPORTED_TARGET GLOBAL ${OPENSHMEM_PC})
+
+        if(NOT OPENSHMEM_FOUND)
+
+          # Capture the tool output in the build tree so the source tree is never written to.
+          set(OSHMEM_INFO_INCOUTPUT
+              "${CMAKE_CURRENT_BINARY_DIR}/oshmem_info_stdout_inc.log"
+          )
+          set(OSHMEM_INFO_INCERROR
+              "${CMAKE_CURRENT_BINARY_DIR}/oshmem_info_error_inc.log"
+          )
+          execute_process(
+            COMMAND "${OSHMEM_INFO}" --path incdir
+            WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+            RESULT_VARIABLE OSHMEM_INFO_INCSTATUS
+            OUTPUT_FILE "${OSHMEM_INFO_INCOUTPUT}"
+            ERROR_FILE "${OSHMEM_INFO_INCERROR}"
+          )
+
+          if(OSHMEM_INFO_INCSTATUS)
+            message(
+              FATAL_ERROR
+                "${OSHMEM_INFO} Failed! Program status code: ${OSHMEM_INFO_INCSTATUS}"
+            )
+          endif()
+          file(READ ${OSHMEM_INFO_INCOUTPUT} OSHMEM_INFO_OUTPUT_INCCONTENT)
+
+          if(NOT DEFINED OSHMEM_INFO_OUTPUT_INCCONTENT)
+            message(
+              FATAL_ERROR
+                "${OSHMEM_INFO} Failed! Check: ${OSHMEM_INFO_INCERROR}"
+            )
+          endif()
+
+          if("${OSHMEM_INFO_OUTPUT_INCCONTENT}" STREQUAL "")
+            message(
+              FATAL_ERROR
+                "${OSHMEM_INFO} Failed! Check: ${OSHMEM_INFO_INCERROR}\n${OSHMEM_INFO_OUTPUT_INCCONTENT}"
+            )
+          endif()
+
+          string(REGEX MATCH "(\/.*)" OSHMEM_INCDIR_PATH
+                       ${OSHMEM_INFO_OUTPUT_INCCONTENT}
+          )
+
+          string(STRIP ${OSHMEM_INCDIR_PATH} OSHMEM_INCDIR_PATH)
+
+          set(OPENSHMEM_CFLAGS
+              "-I${OSHMEM_INCDIR_PATH}" "-pthread" "-I${OSHMEM_LIBDIR_PATH}"
+          ) # one list element per flag; a single space-joined string would be handled as one flag further down
+          set(OPENSHMEM_LDFLAGS "-loshmem")
+          set(OPENSHMEM_LIBRARY_DIRS "${OSHMEM_LIBDIR_PATH}")
+
+          add_library(PkgConfig::OPENSHMEM INTERFACE IMPORTED GLOBAL)
+
+          set(OPENSHMEM_FOUND ON)
+        endif()
+      endif()
+    else()
+
+      include(cmake/FindOpenShmemPmi.cmake)
+
+      set(PMI_AUTOCONF_OPTS "")
+      if(NOT PMI_LIBRARY OR NOT PMI_FOUND)
+        set(PMI_AUTOCONF_OPTS "--enable-pmi-simple")
+      else()
+        string(REGEX MATCH "(^\/[^\/]+)" PMI_INCLUDE_DIR_ROOT_PATH
+                     ${PMI_INCLUDE_DIR}
+        )
+        string(REGEX MATCH "(^\/[^\/]+)" PMI_LIBRARY_ROOT_PATH ${PMI_LIBRARY})
+        set(PMI_AUTOCONF_OPTS
+            "--with-pmi=${PMI_INCLUDE_DIR_ROOT_PATH} --with-pmi-libdir=${PMI_LIBRARY_ROOT_PATH}"
+        )
+      endif()
+
+      set(OPENSHMEM_PC "osss-ucx")
+
+      pkg_search_module(OPENSHMEM IMPORTED_TARGET GLOBAL ${OPENSHMEM_PC})
+      if(NOT OPENSHMEM_FOUND)
+        set(OPENSHMEM_PC "sandia-openshmem")
+        pkg_search_module(OPENSHMEM IMPORTED_TARGET GLOBAL ${OPENSHMEM_PC})
+      endif()
+    endif()
+  endif()
+
+  if(OPENSHMEM_CFLAGS)
+    set(IS_PARAM "0")
+    set(PARAM_FOUND "0")
+    set(NEWPARAM "")
+    set(IDX 0)
+    set(FLAG_LIST "")
+
+    foreach(X IN ITEMS ${OPENSHMEM_CFLAGS})
+      string(FIND "${X}" "--param" PARAM_FOUND)
+      if(NOT "${PARAM_FOUND}" EQUAL "-1")
+        set(IS_PARAM "1")
+        set(NEWPARAM "SHELL:${X}")
+      endif()
+      if("${PARAM_FOUND}" EQUAL "-1"
+         AND "${IS_PARAM}" EQUAL "0"
+         OR "${IS_PARAM}" EQUAL "-1"
+      )
+        list(APPEND FLAG_LIST "${X}")
+        set(IS_PARAM "0")
+      elseif("${PARAM_FOUND}" EQUAL "-1" AND "${IS_PARAM}" EQUAL "1")
+        list(APPEND FLAG_LIST "${NEWPARAM}
+          ${X}"
+        )
+        set(NEWPARAM "")
+        set(IS_PARAM "0")
+      endif()
+    endforeach()
+
+    list(LENGTH OPENSHMEM_CFLAGS IDX)
+    foreach(X RANGE ${IDX})
+      list(POP_FRONT OPENSHMEM_CFLAGS NEWPARAM)
+    endforeach()
+
+    foreach(X IN ITEMS ${FLAG_LIST})
+      list(APPEND OPENSHMEM_CFLAGS "${X}")
+    endforeach()
+  endif()
+
+  if(OPENSHMEM_CFLAGS_OTHER)
+    set(IS_PARAM "0")
+    set(PARAM_FOUND "0")
+    set(NEWPARAM "")
+    set(IDX 0)
+    set(FLAG_LIST "")
+
+    foreach(X IN ITEMS ${OPENSHMEM_CFLAGS_OTHER})
+      string(FIND "${X}" "--param" PARAM_FOUND)
+      if(NOT "${PARAM_FOUND}" EQUAL "-1")
+        set(IS_PARAM "1")
+        set(NEWPARAM "SHELL:${X}")
+      endif()
+      if("${PARAM_FOUND}" EQUAL "-1"
+         AND "${IS_PARAM}" EQUAL "0"
+         OR "${IS_PARAM}" EQUAL "-1"
+      )
+        list(APPEND FLAG_LIST "${X}")
+        set(IS_PARAM "0")
+      elseif("${PARAM_FOUND}" EQUAL "-1" AND "${IS_PARAM}" EQUAL "1")
+        list(APPEND FLAG_LIST "${NEWPARAM}
+          ${X}"
+        )
+        set(NEWPARAM "")
+        set(IS_PARAM "0")
+      endif()
+    endforeach()
+
+    list(LENGTH OPENSHMEM_CFLAGS_OTHER IDX)
+    foreach(X RANGE ${IDX})
+      list(POP_FRONT OPENSHMEM_CFLAGS_OTHER NEWPARAM)
+    endforeach()
+
+    foreach(X IN ITEMS ${FLAG_LIST})
+      list(APPEND OPENSHMEM_CFLAGS_OTHER "${X}")
+    endforeach()
+  endif()
+
+  if(OPENSHMEM_LDFLAGS)
+    set(IS_PARAM "0")
+    set(PARAM_FOUND "0")
+    set(NEWPARAM "")
+    set(IDX 0)
+    set(DIRIDX 0)
+    set(SKIP 0)
+    set(FLAG_LIST "")
+    set(DIR_LIST "")
+    set(LIB_LIST "")
+
+    foreach(X IN ITEMS ${OPENSHMEM_LDFLAGS})
+      string(FIND "${X}" "--param" PARAM_FOUND)
+      string(FIND "${X}" "-lsma" IDX)
+      string(FIND "${X}" "-l" LIDX)
+      string(FIND "${X}" "-L" DIRIDX)
+      string(FIND "${X}" "-Wl" SKIP)
+
+      if("${SKIP}" EQUAL "-1")
+        if(NOT "${PARAM_FOUND}" EQUAL "-1")
+          set(IS_PARAM "1")
+          set(NEWPARAM "SHELL:${X}")
+        endif()
+        if("${PARAM_FOUND}" EQUAL "-1"
+           AND "${IDX}" EQUAL "-1"
+           AND "${IS_PARAM}" EQUAL "0"
+           OR "${IS_PARAM}" EQUAL "-1"
+        )
+          list(APPEND FLAG_LIST "${X}")
+          set(IS_PARAM "0")
+        elseif("${PARAM_FOUND}" EQUAL "-1" AND "${IS_PARAM}" EQUAL "1")
+          list(APPEND FLAG_LIST "${NEWPARAM}
+          ${X}"
+          )
+          set(NEWPARAM "")
+          set(IS_PARAM "0")
+        elseif(NOT "${IDX}" EQUAL "-1" AND NOT "${LIDX}" EQUAL "-1")
+          set(TMPSTR "")
+          string(REPLACE "-l" "" TMPSTR "${X}")
+          list(APPEND LIB_LIST "${TMPSTR}")
+          set(IDX 0)
+        elseif("${IDX}" EQUAL "-1" AND NOT "${LIDX}" EQUAL "-1")
+          list(APPEND FLAG_LIST "${X}")
+        endif()
+        if(NOT "${DIRIDX}" EQUAL "-1")
+          set(TMPSTR "")
+          string(REPLACE "-L" "" TMPSTR "${X}")
+          list(APPEND DIR_LIST "${TMPSTR}")
+        endif()
+      endif()
+    endforeach()
+
+    set(IDX 0)
+    list(LENGTH LIB_LIST IDX)
+
+    if(NOT "${IDX}" EQUAL "0")
+      set(IDX 0)
+
+      if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(NEWLINK "SHELL:-Wl,--whole-archive
+          "
+        )
+        foreach(X IN ITEMS ${LIB_LIST})
+          set(DIRSTR "")
+          string(REPLACE ";" "
+          " DIRSTR "${DIR_LIST}"
+          )
+          foreach(Y IN ITEMS ${DIR_LIST})
+            find_library(
+              FOUND_LIB
+              NAMES ${X} "lib${X}" "lib${X}.a"
+              PATHS ${Y}
+              HINTS ${Y} NO_CACHE
+              NO_CMAKE_FIND_ROOT_PATH NO_DEFAULT_PATH
+            )
+
+            list(LENGTH FOUND_LIB IDX)
+            if(NOT "${IDX}" EQUAL "0")
+              string(APPEND NEWLINK "${FOUND_LIB}")
+              set(FOUND_LIB "")
+            endif()
+          endforeach()
+        endforeach()
+        string(APPEND NEWLINK "
+          -Wl,--no-whole-archive"
+        )
+        string(FIND "SHELL:-Wl,--whole-archive
+          -Wl,--no-whole-archive" "${NEWLINK}" IDX
+        )
+        if("${IDX}" EQUAL "-1")
+          list(APPEND OPENSHMEM_LDFLAGS "${NEWLINK}")
+       endif()
+      elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        if(APPLE)
+          set(NEWLINK "SHELL:-Wl,-force_load,")
+        else()
+          set(NEWLINK "SHELL:
+          "
+          )
+        endif()
+        foreach(X IN ITEMS ${LIB_LIST})
+          set(DIRSTR "")
+          string(REPLACE ";" "
+          " DIRSTR "${DIR_LIST}"
+          )
+          foreach(Y IN ITEMS ${DIR_LIST})
+            find_library(
+              FOUND_LIB
+              NAMES ${X} "lib${X}" "lib${X}.a"
+              PATHS ${Y}
+              HINTS ${Y} NO_CACHE
+              NO_CMAKE_FIND_ROOT_PATH NO_DEFAULT_PATH
+            )
+
+            list(LENGTH FOUND_LIB IDX)
+            if(NOT "${IDX}" EQUAL "0")
+              string(APPEND NEWLINK "${FOUND_LIB}")
+              set(FOUND_LIB "")
+            endif()
+          endforeach()
+        endforeach()
+        string(FIND "SHELL:" "${NEWLINK}" IDX)
+        if("${IDX}" EQUAL "-1")
+          list(APPEND OPENSHMEM_LDFLAGS "${NEWLINK}")
+        endif()
+      endif()
+    endif()
+  endif()
+
+  if(OPENSHMEM_LDFLAGS_OTHER)
+    unset(FOUND_LIB)
+    set(IS_PARAM "0")
+    set(PARAM_FOUND "0")
+    set(NEWPARAM "")
+    set(SKIP 0)
+    set(IDX 0)
+    set(DIRIDX 0)
+    set(FLAG_LIST "")
+    set(DIR_LIST "")
+    set(LIB_LIST "")
+
+    foreach(X IN ITEMS ${OPENSHMEM_LDFLAGS_OTHER})
+      string(FIND "${X}" "--param" PARAM_FOUND)
+      string(FIND "${X}" "-lsma" IDX)
+      string(FIND "${X}" "-l" LIDX) # LIDX is tested by the branches below; compute it for this item
+      string(FIND "${X}" "-L" DIRIDX)
+      string(FIND "${X}" "-Wl" SKIP)
+      if("${SKIP}" EQUAL "-1")
+        if(NOT "${PARAM_FOUND}" EQUAL "-1")
+          set(IS_PARAM "1")
+          set(NEWPARAM "SHELL:${X}")
+        endif()
+        if("${PARAM_FOUND}" EQUAL "-1"
+           AND "${IDX}" EQUAL "-1"
+           AND "${IS_PARAM}" EQUAL "0"
+           OR "${IS_PARAM}" EQUAL "-1"
+        )
+          list(APPEND FLAG_LIST "${X}")
+          set(IS_PARAM "0")
+        elseif("${PARAM_FOUND}" EQUAL "-1" AND "${IS_PARAM}" EQUAL "1")
+          list(APPEND FLAG_LIST "${NEWPARAM}
+          ${X}"
+          )
+          set(NEWPARAM "")
+          set(IS_PARAM "0")
+        elseif(NOT "${IDX}" EQUAL "-1" AND NOT "${LIDX}" EQUAL "-1")
+          set(TMPSTR "")
+          string(REPLACE "-l" "" TMPSTR "${X}")
+          list(APPEND LIB_LIST "${TMPSTR}")
+          set(IDX 0)
+        elseif("${IDX}" EQUAL "-1" AND NOT "${LIDX}" EQUAL "-1")
+          list(APPEND FLAG_LIST "${X}")
+        endif()
+        if(NOT "${DIRIDX}" EQUAL "-1")
+          set(TMPSTR "")
+          string(REPLACE "-L" "" TMPSTR "${X}")
+          list(APPEND DIR_LIST "${TMPSTR}")
+        endif()
+      endif()
+    endforeach()
+
+    set(IDX 0)
+    list(LENGTH OPENSHMEM_LDFLAGS_OTHER IDX)
+    foreach(X RANGE ${IDX})
+      list(POP_FRONT OPENSHMEM_LDFLAGS_OTHER NEWPARAM)
+    endforeach()
+
+    foreach(X IN ITEMS ${FLAG_LIST})
+      list(APPEND OPENSHMEM_LDFLAGS_OTHER "${X}")
+    endforeach()
+
+    set(IDX 0)
+    list(LENGTH LIB_LIST IDX)
+    if(NOT "${IDX}" EQUAL "0")
+      set(IDX 0)
+      if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(NEWLINK "SHELL:-Wl,--whole-archive
+          "
+        )
+        foreach(X IN ITEMS ${LIB_LIST})
+          set(DIRSTR "")
+          string(REPLACE ";" "
+          " DIRSTR "${DIR_LIST}"
+          )
+          foreach(Y IN ITEMS ${DIR_LIST})
+            find_library(
+              FOUND_LIB
+              NAMES ${X} "lib${X}" "lib${X}.a"
+              PATHS ${Y}
+              HINTS ${Y} NO_CACHE
+              NO_CMAKE_FIND_ROOT_PATH NO_DEFAULT_PATH
+            )
+
+            list(LENGTH FOUND_LIB IDX)
+            if(NOT "${IDX}" EQUAL "0")
+              string(APPEND NEWLINK "${FOUND_LIB}")
+              set(FOUND_LIB "")
+            endif()
+          endforeach()
+        endforeach()
+        string(APPEND NEWLINK "
+          -Wl,--no-whole-archive"
+        )
+
+        string(FIND "SHELL:-Wl,--whole-archive
+          -Wl,--no-whole-archive" "${NEWLINK}" IDX
+        )
+        if("${IDX}" EQUAL "-1")
+          list(APPEND OPENSHMEM_LDFLAGS_OTHER "${NEWLINK}")
+        endif()
+      elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        if(APPLE)
+          set(NEWLINK "SHELL:-Wl,-force_load,")
+        else()
+          set(NEWLINK "SHELL:
+          "
+          )
+        endif()
+        foreach(X IN ITEMS ${LIB_LIST})
+          set(DIRSTR "")
+          string(REPLACE ";" "
+          " DIRSTR "${DIR_LIST}"
+          )
+          foreach(Y IN ITEMS ${DIR_LIST})
+            find_library(
+              FOUND_LIB
+              NAMES ${X} "lib${X}" "lib${X}.a"
+              PATHS ${Y}
+              HINTS ${Y} NO_CACHE
+              NO_CMAKE_FIND_ROOT_PATH NO_DEFAULT_PATH
+            )
+
+            list(LENGTH FOUND_LIB IDX)
+            if(NOT "${IDX}" EQUAL "0")
+              string(APPEND NEWLINK "${FOUND_LIB}")
+              set(FOUND_LIB "")
+            endif()
+          endforeach()
+        endforeach()
+        string(FIND "SHELL:" "${NEWLINK}" IDX)
+        if("${IDX}" EQUAL "-1")
+          list(APPEND OPENSHMEM_LDFLAGS_OTHER "${NEWLINK}") # same destination as the GNU branch of this section
+        endif()
+      endif()
+    endif()
+  endif()
+
+  if(OPENSHMEM_STATIC_CFLAGS)
+    set(IS_PARAM "0")
+    set(PARAM_FOUND "0")
+    set(NEWPARAM "")
+    set(IDX 0)
+    set(FLAG_LIST "")
+
+    foreach(X IN ITEMS ${OPENSHMEM_STATIC_CFLAGS})
+      string(FIND "${X}" "--param" PARAM_FOUND)
+      if(NOT "${PARAM_FOUND}" EQUAL "-1")
+        set(IS_PARAM "1")
+        set(NEWPARAM "SHELL:${X}")
+      endif()
+      if("${PARAM_FOUND}" EQUAL "-1"
+         AND "${IS_PARAM}" EQUAL "0"
+         OR "${IS_PARAM}" EQUAL "-1"
+      )
+        list(APPEND FLAG_LIST "${X}")
+        set(IS_PARAM "0")
+      elseif("${PARAM_FOUND}" EQUAL "-1" AND "${IS_PARAM}" EQUAL "1")
+        list(APPEND FLAG_LIST "${NEWPARAM}
+          ${X}"
+        )
+        set(NEWPARAM "")
+        set(IS_PARAM "0")
+      endif()
+    endforeach()
+
+    list(LENGTH OPENSHMEM_STATIC_CFLAGS IDX)
+    foreach(X RANGE ${IDX})
+      list(POP_FRONT OPENSHMEM_STATIC_CFLAGS NEWPARAM)
+    endforeach()
+
+    foreach(X IN ITEMS ${FLAG_LIST})
+      list(APPEND OPENSHMEM_STATIC_CFLAGS "${X}")
+    endforeach()
+  endif()
+
+  if(OPENSHMEM_STATIC_CFLAGS_OTHER)
+    set(IS_PARAM "0")
+    set(PARAM_FOUND "0")
+    set(NEWPARAM "")
+    set(IDX 0)
+    set(FLAG_LIST "")
+   foreach(X IN ITEMS ${OPENSHMEM_STATIC_CFLAGS_OTHER})
+      string(FIND "${X}" "--param" PARAM_FOUND)
+      if(NOT "${PARAM_FOUND}" EQUAL "-1")
+        set(IS_PARAM "1")
+        set(NEWPARAM "SHELL:${X}")
+      endif()
+      if("${PARAM_FOUND}" EQUAL "-1"
+         AND "${IS_PARAM}" EQUAL "0"
+         OR "${IS_PARAM}" EQUAL "-1"
+      )
+        list(APPEND FLAG_LIST "${X}")
+        set(IS_PARAM "0")
+      elseif("${PARAM_FOUND}" EQUAL "-1" AND "${IS_PARAM}" EQUAL "1")
+        list(APPEND FLAG_LIST "${NEWPARAM}
+          ${X}"
+        )
+        set(NEWPARAM "")
+        set(IS_PARAM "0")
+      endif()
+    endforeach()
+
+    list(LENGTH OPENSHMEM_STATIC_CFLAGS_OTHER IDX)
+    foreach(X RANGE ${IDX})
+      list(POP_FRONT OPENSHMEM_STATIC_CFLAGS_OTHER NEWPARAM)
+    endforeach()
+
+    foreach(X IN ITEMS ${FLAG_LIST})
+      list(APPEND OPENSHMEM_STATIC_CFLAGS_OTHER "${X}")
+    endforeach()
+  endif()
+
+  if(OPENSHMEM_STATIC_LDFLAGS)
+    unset(FOUND_LIB)
+    set(IS_PARAM "0")
+    set(PARAM_FOUND "0")
+    set(NEWPARAM "")
+    set(SKIP 0)
+    set(IDX 0)
+    set(DIRIDX 0)
+    set(FLAG_LIST "")
+    set(DIR_LIST "")
+    set(LIB_LIST "")
+    foreach(X IN ITEMS ${OPENSHMEM_STATIC_LDFLAGS})
+      string(FIND "${X}" "--param" PARAM_FOUND)
+      if("${HPX_WITH_PARCELPORT_OPENSHMEM_CONDUIT}" STREQUAL "mpi") # NOTE(review): not set in the visible CMake code - confirm
+        string(FIND "${X}" "-loshmem" IDX)
+      else()
+        string(FIND "${X}" "-lsma" IDX)
+      endif()
+      string(FIND "${X}" "-l" LIDX) # LIDX is tested by the branches below; compute it for this item
+      string(FIND "${X}" "-L" DIRIDX)
+      string(FIND "${X}" "-Wl" SKIP)
+      if("${SKIP}" EQUAL "-1")
+        if(NOT "${PARAM_FOUND}" EQUAL "-1")
+          set(IS_PARAM "1")
+          set(NEWPARAM "SHELL:${X}")
+        endif()
+        if("${PARAM_FOUND}" EQUAL "-1"
+           AND "${IDX}" EQUAL "-1"
+           AND "${IS_PARAM}" EQUAL "0"
+           OR "${IS_PARAM}" EQUAL "-1"
+        )
+          list(APPEND FLAG_LIST "${X}")
+          set(IS_PARAM "0")
+        elseif("${PARAM_FOUND}" EQUAL "-1" AND "${IS_PARAM}" EQUAL "1")
+          list(APPEND FLAG_LIST "${NEWPARAM}
+          ${X}"
+          )
+          set(NEWPARAM "")
+          set(IS_PARAM "0")
+        elseif(NOT "${IDX}" EQUAL "-1" AND NOT "${LIDX}" EQUAL "-1")
+          set(TMPSTR "")
+          string(REPLACE "-l" "" TMPSTR "${X}")
+          list(APPEND LIB_LIST "${TMPSTR}")
+          set(IDX 0)
+        elseif("${IDX}" EQUAL "-1" AND NOT "${LIDX}" EQUAL "-1")
+          list(APPEND FLAG_LIST "${X}")
+        endif()
+        if(NOT "${DIRIDX}" EQUAL "-1")
+          set(TMPSTR "")
+          string(REPLACE "-L" "" TMPSTR "${X}")
+          list(APPEND DIR_LIST "${TMPSTR}")
+        endif()
+      endif()
+    endforeach()
+    set(IDX 0)
+    list(LENGTH OPENSHMEM_STATIC_LDFLAGS IDX)
+    foreach(X RANGE ${IDX})
+      list(POP_FRONT OPENSHMEM_STATIC_LDFLAGS NEWPARAM)
+    endforeach()
+
+    foreach(X IN ITEMS ${FLAG_LIST})
+      list(APPEND OPENSHMEM_STATIC_LDFLAGS "${X}")
+    endforeach()
+
+    set(IDX 0)
+    list(LENGTH LIB_LIST IDX)
+    if(NOT "${IDX}" EQUAL "0")
+      set(IDX 0)
+      if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(NEWLINK "SHELL:-Wl,--whole-archive
+          "
+        )
+        foreach(X IN ITEMS ${LIB_LIST})
+          set(DIRSTR "")
+          string(REPLACE ";" "
+          " DIRSTR "${DIR_LIST}"
+          )
+          foreach(Y IN ITEMS ${DIR_LIST})
+            find_library(
+              FOUND_LIB
+              NAMES ${X} "lib${X}" "lib${X}.a"
+              PATHS ${Y}
+              HINTS ${Y} NO_CACHE
+              NO_CMAKE_FIND_ROOT_PATH NO_DEFAULT_PATH
+            )
+
+            list(LENGTH FOUND_LIB IDX)
+
+            if(NOT "${IDX}" EQUAL "0")
+              string(APPEND NEWLINK "${FOUND_LIB}")
+              set(FOUND_LIB "")
+            endif()
+          endforeach()
+        endforeach()
+        string(APPEND NEWLINK "
+          -Wl,--no-whole-archive"
+        )
+
+        string(FIND "SHELL:-Wl,--whole-archive
+          -Wl,--no-whole-archive" "${NEWLINK}" IDX
+        )
+        if("${IDX}" EQUAL "-1")
+          list(APPEND OPENSHMEM_STATIC_LDFLAGS "${NEWLINK}")
+        endif()
+     elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        if(APPLE)
+          set(NEWLINK "SHELL:-Wl,-force_load,")
+        else()
+          set(NEWLINK "SHELL:
+          "
+          )
+        endif()
+        foreach(X IN ITEMS ${LIB_LIST})
+          set(DIRSTR "")
+          string(REPLACE ";" "
+          " DIRSTR "${DIR_LIST}"
+          )
+          foreach(Y IN ITEMS ${DIR_LIST})
+            find_library(
+              FOUND_LIB
+              NAMES ${X} "lib${X}" "lib${X}.a"
+              PATHS ${Y}
+              HINTS ${Y} NO_CACHE
+              NO_CMAKE_FIND_ROOT_PATH NO_DEFAULT_PATH
+            )
+
+            list(LENGTH FOUND_LIB IDX)
+            if(NOT "${IDX}" EQUAL "0")
+              string(APPEND NEWLINK "${FOUND_LIB}")
+              set(FOUND_LIB "")
+            endif()
+          endforeach()
+        endforeach()
+        string(FIND "SHELL:" "${NEWLINK}" IDX)
+        if("${IDX}" EQUAL "-1")
+          list(APPEND OPENSHMEM_STATIC_LDFLAGS "${NEWLINK}") # same destination as the GNU branch of this section
+        endif()
+      endif()
+    endif()
+  endif()
+
+  if(OPENSHMEM_STATIC_LDFLAGS_OTHER)
+    unset(FOUND_LIB)
+    set(IS_PARAM "0")
+    set(PARAM_FOUND "0")
+    set(NEWPARAM "")
+    set(SKIP 0)
+    set(IDX 0)
+    set(DIRIDX 0)
+    set(FLAG_LIST "")
+    set(DIR_LIST "")
+    set(LIB_LIST "")
+
+    foreach(X IN ITEMS ${OPENSHMEM_STATIC_LDFLAGS_OTHER})
+      string(FIND "${X}" "--param" PARAM_FOUND)
+      if("${HPX_WITH_PARCELPORT_OPENSHMEM_CONDUIT}" STREQUAL "mpi") # NOTE(review): not set in the visible CMake code - confirm
+        string(FIND "${X}" "-loshmem" IDX)
+      else()
+        string(FIND "${X}" "-lsma" IDX)
+      endif()
+      string(FIND "${X}" "-l" LIDX) # LIDX is tested by the branches below; compute it for this item
+      string(FIND "${X}" "-L" DIRIDX)
+      string(FIND "${X}" "-Wl" SKIP)
+      if("${SKIP}" EQUAL "-1")
+        if(NOT "${PARAM_FOUND}" EQUAL "-1")
+          set(IS_PARAM "1")
+          set(NEWPARAM "SHELL:${X}")
+        endif()
+        if("${PARAM_FOUND}" EQUAL "-1"
+           AND "${IDX}" EQUAL "-1"
+           AND "${IS_PARAM}" EQUAL "0"
+           OR "${IS_PARAM}" EQUAL "-1"
+        )
+          list(APPEND FLAG_LIST "${X}")
+          set(IS_PARAM "0")
+        elseif("${PARAM_FOUND}" EQUAL "-1" AND "${IS_PARAM}" EQUAL "1")
+          list(APPEND FLAG_LIST "${NEWPARAM}
+          ${X}"
+          )
+          set(NEWPARAM "")
+          set(IS_PARAM "0")
+        elseif(NOT "${IDX}" EQUAL "-1" AND NOT "${LIDX}" EQUAL "-1")
+          set(TMPSTR "")
+          string(REPLACE "-l" "" TMPSTR "${X}")
+          list(APPEND LIB_LIST "${TMPSTR}")
+          set(IDX 0)
+        elseif("${IDX}" EQUAL "-1" AND NOT "${LIDX}" EQUAL "-1")
+          list(APPEND FLAG_LIST "${X}")
+        endif()
+        if(NOT "${DIRIDX}" EQUAL "-1")
+          set(TMPSTR "")
+          string(REPLACE "-L" "" TMPSTR "${X}")
+          list(APPEND DIR_LIST "${TMPSTR}")
+        endif()
+      endif()
+    endforeach()
+
+    set(IDX 0)
+    list(LENGTH OPENSHMEM_STATIC_LDFLAGS_OTHER IDX)
+    foreach(X RANGE ${IDX})
+      list(POP_FRONT OPENSHMEM_STATIC_LDFLAGS_OTHER NEWPARAM)
+    endforeach()
+
+    foreach(X IN ITEMS ${FLAG_LIST})
+      list(APPEND OPENSHMEM_STATIC_LDFLAGS_OTHER "${X}")
+    endforeach()
+
+    set(IDX 0)
+    list(LENGTH LIB_LIST IDX)
+    if(NOT "${IDX}" EQUAL "0")
+      set(IDX 0)
+      if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(NEWLINK "SHELL:-Wl,--whole-archive
+          "
+        )
+        foreach(X IN ITEMS ${LIB_LIST})
+          set(DIRSTR "")
+          string(REPLACE ";" "
+          " DIRSTR "${DIR_LIST}"
+          )
+          foreach(Y IN ITEMS ${DIR_LIST})
+            find_library(
+              FOUND_LIB
+              NAMES ${X} "lib${X}" "lib${X}.a"
+              PATHS ${Y}
+              HINTS ${Y} NO_CACHE
+              NO_CMAKE_FIND_ROOT_PATH NO_DEFAULT_PATH
+            )
+
+            list(LENGTH FOUND_LIB IDX)
+
+            message(STATUS "${FOUND_LIB}
+          ${X}"
+            )
+            if(NOT "${IDX}" EQUAL "0")
+              string(APPEND NEWLINK "${FOUND_LIB}")
+              set(FOUND_LIB "")
+            endif()
+          endforeach()
+        endforeach()
+        string(APPEND NEWLINK "
+          -Wl,--no-whole-archive"
+        )
+        string(FIND "SHELL:-Wl,--whole-archive
+          -Wl,--no-whole-archive" "${NEWLINK}" IDX
+        )
+        if("${IDX}" EQUAL "-1")
+          list(APPEND OPENSHMEM_STATIC_LDFLAGS_OTHER "${NEWLINK}")
+        endif()
+      elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        if(APPLE)
+          set(NEWLINK "SHELL:-Wl,-force_load,")
+        else()
+          set(NEWLINK "SHELL:
+          "
+          )
+        endif()
+        foreach(X IN ITEMS ${LIB_LIST})
+          set(DIRSTR "")
+          string(REPLACE ";" "
+          " DIRSTR "${DIR_LIST}"
+          )
+          foreach(Y IN ITEMS ${DIR_LIST})
+            find_library(
+              FOUND_LIB
+              NAMES ${X} "lib${X}" "lib${X}.a"
+              PATHS ${Y}
+              HINTS ${Y} NO_CACHE
+              NO_CMAKE_FIND_ROOT_PATH NO_DEFAULT_PATH
+            )
+
+            list(LENGTH FOUND_LIB IDX)
+            if(NOT "${IDX}" EQUAL "0")
+              string(APPEND NEWLINK "${FOUND_LIB}")
+              set(FOUND_LIB "")
+            endif()
+          endforeach()
+        endforeach()
+        string(FIND "SHELL:" "${NEWLINK}" IDX)
+        if("${IDX}" EQUAL "-1")
+          list(APPEND OPENSHMEM_STATIC_LDFLAGS_OTHER "${NEWLINK}") # same destination as the GNU branch of this section
+        endif()
+      endif()
+    endif()
+  endif()
+
+  # Publish the collected flags on the PkgConfig::OPENSHMEM imported target and
+  # report the outcome through OPENSHMEM_FOUND.
+  #
+  # The target is created either by pkg_search_module(... IMPORTED_TARGET ...)
+  # or by the add_library() fallback above, so it is absent when every lookup
+  # failed. set_target_properties() on a missing target aborts the configure
+  # step; the properties are therefore set (and OPENSHMEM_FOUND turned ON) only
+  # when the target is present. Otherwise OPENSHMEM_FOUND is turned OFF and the
+  # caller decides how to react.
+  if(OPENSHMEM_DIR)
+    # Wrap every "${OPENSHMEM_DIR}/install" prefix found in the flag lists in
+    # $<BUILD_INTERFACE:...>.
+    list(TRANSFORM OPENSHMEM_CFLAGS
+         REPLACE "${OPENSHMEM_DIR}/install"
+                 "$<BUILD_INTERFACE:${OPENSHMEM_DIR}/install>"
+    )
+    list(TRANSFORM OPENSHMEM_LDFLAGS
+         REPLACE "${OPENSHMEM_DIR}/install"
+                 "$<BUILD_INTERFACE:${OPENSHMEM_DIR}/install>"
+    )
+    list(TRANSFORM OPENSHMEM_LIBRARY_DIRS
+         REPLACE "${OPENSHMEM_DIR}/install"
+                 "$<BUILD_INTERFACE:${OPENSHMEM_DIR}/install>"
+    )
+  endif()
+
+  # Log the final flag lists.
+  message(STATUS "OPENSHMEM_CFLAGS:\t${OPENSHMEM_CFLAGS}")
+  message(STATUS "OPENSHMEM_LDFLAGS:\t${OPENSHMEM_LDFLAGS}")
+  message(STATUS "OPENSHMEM_LIBRARY_DIRS:\t${OPENSHMEM_LIBRARY_DIRS}")
+
+  if(TARGET PkgConfig::OPENSHMEM)
+    set_target_properties(
+      PkgConfig::OPENSHMEM PROPERTIES INTERFACE_COMPILE_OPTIONS
+                                      "${OPENSHMEM_CFLAGS}"
+    )
+    set_target_properties(
+      PkgConfig::OPENSHMEM PROPERTIES INTERFACE_LINK_OPTIONS
+                                      "${OPENSHMEM_LDFLAGS}"
+    )
+    set_target_properties(
+      PkgConfig::OPENSHMEM PROPERTIES INTERFACE_LINK_DIRECTORIES
+                                      "${OPENSHMEM_LIBRARY_DIRS}"
+    )
+    set(OPENSHMEM_FOUND ON)
+  else()
+    # Nothing was located: report failure instead of erroring out on the
+    # missing target.
+    set(OPENSHMEM_FOUND OFF)
+  endif()
+endmacro()
diff --git a/cmake/FindOpenShmemPmi.cmake b/cmake/FindOpenShmemPmi.cmake
new file mode 100644
index 000000000..5f6814a50
--- /dev/null
+++ b/cmake/FindOpenShmemPmi.cmake
@@ -0,0 +1,65 @@
+# Copyright (c)      2023 Christopher Taylor
+#
+# SPDX-License-Identifier: BSL-1.0
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+find_package(PkgConfig QUIET)
+# look for cray pmi...
+pkg_check_modules(PC_PMI_CRAY QUIET cray-pmi)
+# look for the rest if we couldn't find the cray package
+if(NOT PC_PMI_CRAY_FOUND)
+  pkg_check_modules(PC_PMI QUIET pmi)
+endif()
+
+find_path(
+  PMI_INCLUDE_DIR pmi2.h
+  HINTS ${PMI_ROOT}
+        ENV
+        PMI_ROOT
+        ${PMI_DIR}
+        ENV
+        PMI_DIR
+        ${PC_PMI_CRAY_INCLUDEDIR}
+        ${PC_PMI_CRAY_INCLUDE_DIRS}
+        ${PC_PMI_INCLUDEDIR}
+        ${PC_PMI_INCLUDE_DIRS}
+  PATH_SUFFIXES include
+)
+
+find_library(
+  PMI_LIBRARY
+  NAMES pmi
+  HINTS ${PMI_ROOT}
+        ENV
+        PMI_ROOT
+        ${PC_PMI_CRAY_LIBDIR}
+        ${PC_PMI_CRAY_LIBRARY_DIRS}
+        ${PC_PMI_LIBDIR}
+        ${PC_PMI_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64
+)
+
+# Set PMI_ROOT in case the other hints are used
+if(PMI_ROOT)
+  # The call to file is for compatibility with windows paths
+  file(TO_CMAKE_PATH "${PMI_ROOT}" PMI_ROOT)
+elseif(NOT "$ENV{PMI_ROOT}" STREQUAL "")
+  file(TO_CMAKE_PATH "$ENV{PMI_ROOT}" PMI_ROOT)
+else()
+  file(TO_CMAKE_PATH "${PMI_INCLUDE_DIR}" PMI_INCLUDE_DIR)
+  string(REPLACE "/include" "" PMI_ROOT "${PMI_INCLUDE_DIR}")
+endif()
+
+if(NOT PMI_LIBRARY OR NOT PMI_INCLUDE_DIR)
+  set(PMI_FOUND OFF)
+  return()
+endif()
+
+# A missing PMI is not treated as a fatal error: PMI_FOUND is set to OFF and
+# this file returns early, before find_package_handle_standard_args() runs.
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(PMI DEFAULT_MSG PMI_LIBRARY PMI_INCLUDE_DIR)
+
+mark_as_advanced(PMI_ROOT PMI_LIBRARY PMI_INCLUDE_DIR)
diff --git a/ggml-oshmem.c b/ggml-oshmem.c
new file mode 100644
index 000000000..6acc3b5d4
--- /dev/null
+++ b/ggml-oshmem.c
@@ -0,0 +1,346 @@
+#include "ggml-oshmem.h"
+
+#include "ggml.h"
+
+#include <shmem.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define UNUSED GGML_UNUSED
+
+#define OPENSHMEM_SYMMETRIC_BUFFER_SIZE 4096
+
+struct ggml_openshmem_context {
+    int pe;
+    int n_pes;
+    int64_t symmetric_buffer_size;
+    int64_t symmetric_comm_structure_size;
+    uint8_t * symmetric_comm_structure;
+    long * recv_signal;
+};
+
+void ggml_openshmem_backend_init(void) {
+    shmem_init();
+}
+
+void ggml_openshmem_backend_free(void) {
+    shmem_finalize();
+}
+
+struct ggml_openshmem_context * ggml_openshmem_init(void) {
+    struct ggml_openshmem_context * ctx = calloc(1, sizeof(struct ggml_openshmem_context));
+
+    ctx->pe = shmem_my_pe(); 
+    ctx->n_pes = shmem_n_pes();
+
+    /*
+     * makes a symmetric heap allocation on all processing elements (processes running this SPMD program)
+     *
+     * below is a struct representing the layout of the symmetric allocation:
+     *
+     * {
+     *     int64_t offset_in_buffer,
+     *     int64_t length_in_buffer,
+     *     uint8_t buffer[shmem_npes()][OPENSHMEM_SYMMETRIC_BUFFER_SIZE]
+     * }
+     *
+     */
+    ctx->symmetric_buffer_size = OPENSHMEM_SYMMETRIC_BUFFER_SIZE;
+    ctx->symmetric_comm_structure_size = OPENSHMEM_SYMMETRIC_BUFFER_SIZE + sizeof(int64_t) + sizeof(int64_t);
+    ctx->symmetric_comm_structure = (uint8_t*)shmem_calloc(1, ctx->n_pes*ctx->symmetric_comm_structure_size);
+
+    /*
+     * uint8_t signal_byte[shmem_npes()];
+     */
+    ctx->recv_signal = (long*)shmem_calloc(1, ctx->n_pes*sizeof(long));
+
+    return ctx;
+}
+
+void ggml_openshmem_free(struct ggml_openshmem_context * ctx) {
+    free(ctx); // NOTE(review): symmetric_comm_structure and recv_signal (shmem_calloc'd in ggml_openshmem_init) are not released here
+}
+
+int ggml_openshmem_pe(struct ggml_openshmem_context * ctx) {
+    return ctx->pe;
+}
+
+void ggml_openshmem_eval_init(
+        struct ggml_openshmem_context * ctx_openshmem,
+        int * n_tokens,
+        int * n_past,
+        int * n_threads) {
+    UNUSED(ctx_openshmem);
+    // Overwrites *n_tokens, *n_past and *n_threads on every PE with the values held by PE 0.
+    // NOTE(review): OpenSHMEM collectives expect symmetric dest/source objects - confirm callers.
+    shmem_barrier_all();
+    // each parameter is broadcast from its own storage on the root PE (PE 0)
+    shmem_broadcast(SHMEM_TEAM_WORLD, n_tokens, n_tokens, 1, 0);
+    shmem_broadcast(SHMEM_TEAM_WORLD, n_past, n_past, 1, 0);
+    shmem_broadcast(SHMEM_TEAM_WORLD, n_threads, n_threads, 1, 0);
+
+    shmem_quiet();
+}
+
+static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
+    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
+    if (t == NULL) {
+        fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
+        return -1;
+    }
+    // scan gf->nodes for the tensor: its position is the result, -1 means it is not a graph node
+    for (int i = 0; i < gf->n_nodes; i++) {
+        if (gf->nodes[i] == t) {
+            return i;
+        }
+    }
+
+    fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
+    return -1;
+}
+
+static void ggml_openshmem_tensor_send(struct ggml_openshmem_context * ctx, struct ggml_tensor * t, int dst_pe) {
+
+    const int64_t symmetric_comm_structure_size =
+        ctx->symmetric_comm_structure_size;
+    uint8_t * dst_symmetric_comm_structure =
+        ((uint8_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*ctx->pe);
+    int64_t * dst_symmetric_comm_offset =
+        (int64_t*)(dst_symmetric_comm_structure);
+    int64_t * dst_symmetric_comm_length =
+        ((int64_t*)dst_symmetric_comm_offset)+1; // one int64_t (not sizeof(int64_t) elements) past the offset field
+    uint8_t * dst_symmetric_comm_buffer =
+        ((uint8_t*)dst_symmetric_comm_length)+sizeof(int64_t);
+    long * dst_recv_signal =
+        ctx->recv_signal+dst_pe;
+    long * my_recv_signal =
+        ctx->recv_signal+ctx->pe;
+
+    const int64_t nelements = ggml_nelements(t);
+    // Sends t->data to dst_pe in chunks of at most OPENSHMEM_SYMMETRIC_BUFFER_SIZE bytes. Each
+    // message is staged in this PE's slot of the symmetric structure, copied into the same slot
+    // on dst_pe with shmem_put_signal, and must be acknowledged through my_recv_signal.
+    int64_t xmt_size = 0;
+
+    switch (t->type) {
+        case GGML_TYPE_I32:
+            xmt_size = nelements * sizeof(int32_t);
+        break;
+        case GGML_TYPE_F32:
+            xmt_size = nelements * sizeof(float);
+        break;
+        default: GGML_ASSERT(false && "not implemented");
+    }
+
+    // number of chunks, rounded up so that a partial trailing chunk is transmitted as well;
+    // an empty tensor results in zero chunks
+    const int64_t total_loop_count =
+        (xmt_size + OPENSHMEM_SYMMETRIC_BUFFER_SIZE - 1) / OPENSHMEM_SYMMETRIC_BUFFER_SIZE;
+
+    int64_t xmt_byte_offset = 0;
+    int64_t xmt_byte_amount = 0;
+
+    // handshake: the chunk count travels in the offset field of the slot
+    memcpy(dst_symmetric_comm_offset, &total_loop_count, sizeof(int64_t));
+
+    // the typed put counts int64_t elements, not bytes: exactly one element is transferred
+    shmem_put_signal(
+        dst_symmetric_comm_offset,
+        dst_symmetric_comm_offset,
+        1,
+        dst_recv_signal,
+        1,
+        SHMEM_SIGNAL_SET,
+        dst_pe
+    );
+
+    // wait for the receiver's acknowledgement, then re-arm the local signal word
+    shmem_wait_until(my_recv_signal, SHMEM_CMP_EQ, 1);
+
+    (*my_recv_signal) = 0;
+
+    for(int64_t i = 0; i < total_loop_count; ++i) {
+        xmt_byte_amount = MIN(OPENSHMEM_SYMMETRIC_BUFFER_SIZE, xmt_size - xmt_byte_offset);
+
+        // stage { offset, length, payload } locally; the payload always starts at buffer[0]
+        memcpy(dst_symmetric_comm_offset, &xmt_byte_offset, sizeof(int64_t));
+        memcpy(dst_symmetric_comm_length, &xmt_byte_amount, sizeof(int64_t));
+        memcpy(dst_symmetric_comm_buffer, ((uint8_t*)t->data)+xmt_byte_offset, xmt_byte_amount);
+
+        // copy the staged slot to dst_pe and raise its signal
+        shmem_put_signal(
+            dst_symmetric_comm_structure,
+            dst_symmetric_comm_structure,
+            symmetric_comm_structure_size,
+            dst_recv_signal,
+            1,
+            SHMEM_SIGNAL_SET,
+            dst_pe
+        );
+
+        // the slot is reused for the next chunk, so wait until this one has been acknowledged
+        shmem_wait_until(my_recv_signal, SHMEM_CMP_EQ, 1);
+
+        (*my_recv_signal) = 0;
+
+        xmt_byte_offset += xmt_byte_amount;
+    }
+
+    shmem_fence();
+}
+
+static void ggml_openshmem_tensor_recv(struct ggml_openshmem_context * ctx, struct ggml_tensor * t, int src_pe) {
+
+    uint8_t * src_symmetric_comm_structure =
+        ((uint8_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*src_pe);
+    int64_t * src_symmetric_comm_offset =
+        (int64_t*)(src_symmetric_comm_structure);
+    int64_t * src_symmetric_comm_length =
+        ((int64_t*)src_symmetric_comm_offset)+1; // one int64_t (not sizeof(int64_t) elements) past the offset field
+    uint8_t * src_symmetric_comm_buffer =
+        ((uint8_t*)src_symmetric_comm_length)+sizeof(int64_t);
+    long * src_recv_signal =
+        ctx->recv_signal+src_pe;
+    long* my_recv_signal =
+        ctx->recv_signal+ctx->pe;
+
+    int64_t total_loop_count = 0;
+
+    // Receives a tensor from src_pe: reads the chunk count from the handshake message, then
+    // copies each chunk out of src_pe's slot into t->data, acknowledging every message.
+    shmem_wait_until(my_recv_signal, SHMEM_CMP_EQ, 1);
+    (*my_recv_signal) = 0;
+
+    // the handshake carries the chunk count in the offset field: copy it out of the slot
+    memcpy(&total_loop_count, src_symmetric_comm_offset, sizeof(int64_t));
+    shmem_put_signal(src_symmetric_comm_structure, src_symmetric_comm_structure, 0, src_recv_signal, 1, SHMEM_SIGNAL_SET, src_pe);
+
+    for(int64_t i = 0; i < total_loop_count; ++i) {
+        shmem_wait_until(my_recv_signal, SHMEM_CMP_EQ, 1);
+        (*my_recv_signal) = 0;
+
+        // offset positions the chunk inside t->data; the payload itself starts at buffer[0]
+        memcpy(((uint8_t*)t->data)+(*src_symmetric_comm_offset), src_symmetric_comm_buffer, (*src_symmetric_comm_length));
+
+        shmem_put_signal(src_symmetric_comm_structure, src_symmetric_comm_structure, 0, src_recv_signal, 1, SHMEM_SIGNAL_SET, src_pe);
+    }
+
+    shmem_fence();
+}
+
+// Prepares gf for this PE: moves the pipeline input between neighbouring PEs and trims
+// gf->nodes / gf->n_nodes down to the slice of layers that this PE evaluates.
+// TODO: there are many improvements that can be done to this implementation
+void ggml_openshmem_graph_compute_pre(
+        struct ggml_openshmem_context * ctx_openshmem,
+             struct ggml_cgraph * gf,
+                            int   n_layers) {
+    const int openshmem_rank = ctx_openshmem->pe;
+    const int openshmem_size = ctx_openshmem->n_pes;
+
+    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
+    if (inp_tokens == NULL) {
+        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
+        return;
+    }
+
+    struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
+    if (inp0 == NULL) {
+        fprintf(stderr, "%s: tensor 'layer_inp_0' not found\n", __func__);
+        return;
+    }
+
+    GGML_ASSERT(inp0 == gf->nodes[0]);
+
+    // distribute the compute graph into slices across the OpenSHMEM PEs
+    //
+    // the main node (0) processes the last layers + the remainder of the compute graph
+    // and is responsible to pass the input tokens to the first node (1)
+    //
+    // node 1:   [(  0) * n_per_node, (  1) * n_per_node)
+    // node 2:   [(  1) * n_per_node, (  2) * n_per_node)
+    // ...
+    // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
+    // node 0:   [(n-1) * n_per_node,            n_nodes)
+    // node 1 receives inp_tokens from node 0; every later node receives inp0 from its predecessor
+    {
+        struct ggml_tensor * input_tokens[2] = { inp_tokens, inp0 };
+
+        if (openshmem_rank > 0) {
+            ggml_openshmem_tensor_recv(ctx_openshmem, input_tokens[openshmem_rank != 1], openshmem_rank-1);
+        }
+        else if (openshmem_size > 1) {
+            // node 0 sends the input tokens to node 1
+            ggml_openshmem_tensor_send(ctx_openshmem, input_tokens[0], 1);
+
+            // recv the output data from the last node
+            ggml_openshmem_tensor_recv(ctx_openshmem, input_tokens[1], openshmem_size - 1);
+        }
+    }
+
+    {
+        const int n_per_node = (n_layers + (openshmem_size - 1)) / openshmem_size;
+
+        const int openshmem_idx = openshmem_rank > 0 ? openshmem_rank - 1 : openshmem_size - 1;
+
+        const int il0 =               (openshmem_idx + 0) * n_per_node;
+        const int il1 = MIN(n_layers, (openshmem_idx + 1) * n_per_node);
+
+        char name_l0[GGML_MAX_NAME];
+        char name_l1[GGML_MAX_NAME];
+
+        snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
+        snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
+
+        const int idx_l0 =                ggml_graph_get_node_idx(gf, name_l0);
+        const int idx_l1 = openshmem_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
+        // a failed lookup of name_l1 yields -1 + 1 == 0, so idx_l1 is rejected when it is <= 0
+        if (idx_l0 < 0 || idx_l1 <= 0) {
+            fprintf(stderr, "%s: layer input nodes not found\n", __func__);
+            return;
+        }
+
+        // attach the input data to all nodes that need it
+        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
+        for (int i = idx_l0; i < idx_l1; i++) {
+            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
+                gf->nodes[i]->src[0] =  inp0;
+            }
+            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
+                gf->nodes[i]->src[1] =  inp0;
+            }
+        }
+
+        // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
+        for (int i = 1; i < idx_l1 - idx_l0; i++) {
+            gf->nodes[i] = gf->nodes[idx_l0 + i];
+            gf->grads[i] = gf->grads[idx_l0 + i];
+        }
+
+        // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
+        if (openshmem_idx != 0) {
+            gf->nodes[0]->op = GGML_OP_NONE;
+        }
+
+        gf->n_nodes = idx_l1 - idx_l0;
+    }
+}
+
+void ggml_openshmem_graph_compute_post(
+        struct ggml_openshmem_context * ctx_openshmem,
+             struct ggml_cgraph * gf,
+                            int   n_layers) {
+    UNUSED(n_layers);
+    // PE 0 sends nothing here; every other PE forwards its last graph node to PE (pe + 1) % n_pes
+    const int openshmem_rank = ctx_openshmem->pe;
+    const int openshmem_size = ctx_openshmem->n_pes;
+
+    // send the output data to the next node
+    if (openshmem_rank > 0) {
+        ggml_openshmem_tensor_send(ctx_openshmem, gf->nodes[gf->n_nodes - 1], (openshmem_rank + 1) % openshmem_size);
+    }
+}
diff --git a/ggml-oshmem.h b/ggml-oshmem.h
new file mode 100644
index 000000000..fb953fb0f
--- /dev/null
+++ b/ggml-oshmem.h
@@ -0,0 +1,43 @@
+#pragma once
+#ifndef __LLAMA_CPP_GGML_OSHMEM_H__
+#define __LLAMA_CPP_GGML_OSHMEM_H__
+
+struct ggml_context;
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_openshmem_context;
+
+void ggml_openshmem_backend_init(void);
+void ggml_openshmem_backend_free(void);
+
+struct ggml_openshmem_context * ggml_openshmem_init(void);
+void ggml_openshmem_free(struct ggml_openshmem_context * ctx);
+
+int ggml_openshmem_rank(struct ggml_openshmem_context * ctx);
+
+void ggml_openshmem_eval_init(
+        struct ggml_openshmem_context * ctx_openshmem,
+                            int * n_tokens,
+                            int * n_past,
+                            int * n_threads);
+
+void ggml_openshmem_graph_compute_pre(
+        struct ggml_openshmem_context * ctx_openshmem,
+             struct ggml_cgraph * gf,
+                            int   n_layers);
+
+void ggml_openshmem_graph_compute_post(
+        struct ggml_openshmem_context * ctx_openshmem,
+             struct ggml_cgraph * gf,
+                            int   n_layers);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/llama.cpp b/llama.cpp
index edd2910b3..46318bed3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -19,6 +19,9 @@
 #ifdef GGML_USE_MPI
 #  include "ggml-mpi.h"
 #endif
+#ifdef GGML_USE_OPENSHMEM
+#  include "ggml-oshmem.h"
+#endif
 #ifndef QK_K
 #  ifdef GGML_QKK_64
 #    define QK_K 64

From 9604114da019d0934f5869c9e04d86e077faaac4 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Wed, 20 Dec 2023 23:19:51 -0500
Subject: [PATCH 02/16] added baseline makefile support; fixed several
 compilation warnings

---
 Makefile      | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 ggml-oshmem.c | 12 ++++++------
 ggml-oshmem.h |  2 +-
 3 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 8273f8400..0de6b579e 100644
--- a/Makefile
+++ b/Makefile
@@ -345,6 +345,53 @@ ifdef LLAMA_MPI
 	OBJS        += ggml-mpi.o
 endif # LLAMA_MPI
 
+ifdef LLAMA_OPENSHMEM
+
+        OPENSHMEM_FOUND:=0
+	PKG:=sandia-openshmem
+	REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)')
+	ifneq ($(REQPKG),)
+                OPENSHMEM_FOUND:=1
+		OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags-only-other sandia-openshmem)
+		OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs sandia-openshmem)
+	else
+		$(warning '$(PKG)' not found)
+	endif
+
+        ifneq($(OPENSHMEM_FOUND),1)
+		PKG:=osss-ucx
+		REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)')
+		ifneq ($(REQPKG),)
+        	        OPENSHMEM_FOUND:=1
+			OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags-only-other osss-ucx)
+			OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs osss-ucx)
+		else
+			$(warning '$(PKG)' not found)
+		endif
+	endif
+
+        ifneq($(OPENSHMEM_FOUND),1)
+		PKG:=oshmem
+		REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)')
+		ifneq ($(REQPKG),)
+        	        OPENSHMEM_FOUND:=1
+			OPENSHMEM_CFLAGS:=$(shell oshmem_info --path libdir)
+			OPENSHMEM_LDFLAGS:=$(shell oshmem_info --path incdir)
+		else
+			$(warning '$(PKG)' not found)
+		endif
+	endif
+
+        ifneq($(OPENSHMEM_FOUND),1)
+		$(error '$(PKG)' not found)
+	endif
+
+	MK_CPPFLAGS += -DGGML_USE_OPENSHMEM
+	MK_CFLAGS   += -Wno-cast-qual $(OPENSHMEM_CFLAGS)
+	MK_LDFLAGS += -Wno-cast-qual $(OPENSHMEM_LDFLAGS)
+	OBJS        += ggml-oshmem.o
+endif # LLAMA_OPENSHMEM
+
 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index 6acc3b5d4..ff8477141 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -20,7 +20,7 @@ struct ggml_openshmem_context {
     int64_t symmetric_buffer_size;
     int64_t symmetric_comm_structure_size;
     uint8_t * symmetric_comm_structure;
-    long * recv_signal;
+    uint64_t * recv_signal;
 };
 
 void ggml_openshmem_backend_init(void) {
@@ -56,7 +56,7 @@ struct ggml_openshmem_context * ggml_openshmem_init(void) {
     /*
      * uint8_t signal_byte[shmem_npes()];
      */
-    ctx->recv_signal = (long*)shmem_calloc(1, ctx->n_pes*sizeof(long));
+    ctx->recv_signal = (uint64_t*)shmem_calloc(1, ctx->n_pes*sizeof(uint64_t));
 
     return ctx;
 }
@@ -115,9 +115,9 @@ static void ggml_openshmem_tensor_send(struct ggml_openshmem_context * ctx, stru
         ((int64_t*)dst_symmetric_comm_offset)+sizeof(int64_t);
     uint8_t * dst_symmetric_comm_buffer =
         ((uint8_t*)dst_symmetric_comm_length)+sizeof(int64_t);
-    long * dst_recv_signal =
+    uint64_t * dst_recv_signal =
         ctx->recv_signal+dst_pe;
-    long * my_recv_signal =
+    uint64_t * my_recv_signal =
         ctx->recv_signal+ctx->pe;
 
     const int64_t nelements = ggml_nelements(t);
@@ -203,9 +203,9 @@ static void ggml_openshmem_tensor_recv(struct ggml_openshmem_context * ctx, stru
         ((int64_t*)src_symmetric_comm_offset)+sizeof(int64_t);
     uint8_t * src_symmetric_comm_buffer =
         ((uint8_t*)src_symmetric_comm_length)+sizeof(int64_t);
-    long * src_recv_signal =
+    uint64_t * src_recv_signal =
         ctx->recv_signal+src_pe;
-    long* my_recv_signal =
+    uint64_t * my_recv_signal =
         ctx->recv_signal+ctx->pe;
 
     int64_t total_loop_count = 0;
diff --git a/ggml-oshmem.h b/ggml-oshmem.h
index fb953fb0f..ea88585ad 100644
--- a/ggml-oshmem.h
+++ b/ggml-oshmem.h
@@ -18,7 +18,7 @@ void ggml_openshmem_backend_free(void);
 struct ggml_openshmem_context * ggml_openshmem_init(void);
 void ggml_openshmem_free(struct ggml_openshmem_context * ctx);
 
-int ggml_openshmem_rank(struct ggml_openshmem_context * ctx);
+int ggml_openshmem_pe(struct ggml_openshmem_context * ctx);
 
 void ggml_openshmem_eval_init(
         struct ggml_openshmem_context * ctx_openshmem,

From 79be614ea59702dd770bec6ff043340f9d2bc236 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Wed, 20 Dec 2023 23:52:34 -0500
Subject: [PATCH 03/16] updated README.md and Makefile

---
 Makefile  | 63 +++++++++++++++++++++++++++++--------------------------
 README.md | 47 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 30 deletions(-)

diff --git a/Makefile b/Makefile
index 0de6b579e..fcb4db976 100644
--- a/Makefile
+++ b/Makefile
@@ -346,49 +346,52 @@ ifdef LLAMA_MPI
 endif # LLAMA_MPI
 
 ifdef LLAMA_OPENSHMEM
-
-        OPENSHMEM_FOUND:=0
-	PKG:=sandia-openshmem
-	REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)')
-	ifneq ($(REQPKG),)
-                OPENSHMEM_FOUND:=1
-		OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags-only-other sandia-openshmem)
-		OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs sandia-openshmem)
-	else
-		$(warning '$(PKG)' not found)
-	endif
-
-        ifneq($(OPENSHMEM_FOUND),1)
-		PKG:=osss-ucx
-		REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)')
-		ifneq ($(REQPKG),)
-        	        OPENSHMEM_FOUND:=1
-			OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags-only-other osss-ucx)
-			OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs osss-ucx)
+	ifndef OPENSHMEM_FOUND
+		OSHMEM_PKG:=sandia-openshmem
+		OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)')
+		ifneq ($(OSHMEM_REQPKG),)
+			OPENSHMEM_FOUND:=1
+			OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags sandia-openshmem)
+			OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs sandia-openshmem)
+			warn := $(warning OpenSHMEM found)
 		else
-			$(warning '$(PKG)' not found)
+			$(warning '$(OSHMEM_PKG)' not found)
 		endif
 	endif
 
-        ifneq($(OPENSHMEM_FOUND),1)
-		PKG:=oshmem
-		REQPKG:=$(shell pkg-config --exists $(PKG) && echo '$(PKG)')
-		ifneq ($(REQPKG),)
-        	        OPENSHMEM_FOUND:=1
+	ifndef OPENSHMEM_FOUND
+		OSHMEM_PKG:=osss-ucx
+		OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)')
+		ifneq ($(OSHMEM_REQPKG),)
+			OPENSHMEM_FOUND:=1
+			OPENSHMEM_CFLAGS:=$(shell pkg-config --cflags osss-ucx)
+			OPENSHMEM_LDFLAGS:=$(shell pkg-config --libs osss-ucx)
+			warn := $(warning OpenSHMEM found)
+		else
+			$(warning '$(OSHMEM_PKG)' not found)
+		endif
+	endif
+
+	ifndef OPENSHMEM_FOUND
+		OSHMEM_PKG:=oshmem
+		OSHMEM_REQPKG:=$(shell pkg-config --exists $(OSHMEM_PKG) && echo '$(OSHMEM_PKG)')
+		ifneq ($(OSHMEM_REQPKG),)
+			OPENSHMEM_FOUND:=1
 			OPENSHMEM_CFLAGS:=$(shell oshmem_info --path libdir)
 			OPENSHMEM_LDFLAGS:=$(shell oshmem_info --path incdir)
+			warn := $(warning OpenSHMEM found)
 		else
-			$(warning '$(PKG)' not found)
+			$(warning '$(OSHMEM_PKG)' not found)
 		endif
 	endif
 
-        ifneq($(OPENSHMEM_FOUND),1)
-		$(error '$(PKG)' not found)
+	ifndef OPENSHMEM_FOUND
+		$(error OpenSHMEM not found)
 	endif
 
-	MK_CPPFLAGS += -DGGML_USE_OPENSHMEM
+	MK_CPPFLAGS += -DGGML_USE_OPENSHMEM $(OPENSHMEM_CFLAGS)
 	MK_CFLAGS   += -Wno-cast-qual $(OPENSHMEM_CFLAGS)
-	MK_LDFLAGS += -Wno-cast-qual $(OPENSHMEM_LDFLAGS)
+	MK_LDFLAGS  += -Wno-cast-qual $(OPENSHMEM_LDFLAGS)
 	OBJS        += ggml-oshmem.o
 endif # LLAMA_OPENSHMEM
 
diff --git a/README.md b/README.md
index 01aef2afc..a3ad2530f 100644
--- a/README.md
+++ b/README.md
@@ -335,6 +335,53 @@ Finally, you're ready to run a computation using `mpirun`:
 mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```
 
+### OpenSHMEM Build
+
+OpenSHMEM lets you distribute the computation over a cluster of machines using a Partitioned Global Address Space (PGAS). Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
+
+First you will need the OpenSHMEM libraries installed on your system. There are 3 options: [OpenMPI's OpenSHMEM](https://www.open-mpi.org), [OSSS-OpenSHMEM](https://github.com/openshmem-org/osss-ucx) and [Sandia-OpenSHMEM](https://github.com/Sandia-OpenSHMEM/SOS). OSSS-OpenSHMEM has a dependency on the [UCX](https://github.com/openucx/ucx) communication library. Sandia-OpenSHMEM can run over udp, [UCX](https://github.com/openucx/ucx), or [libfabric](https://github.com/ofiwg/libfabric). OpenMPI's OpenSHMEM can be installed with a package manager (apt, homebrew, etc). UCX, OSSS-OpenSHMEM, and Sandia-OpenSHMEM can all be installed from source.
+
+Next you will need to build the project with `LLAMA_OPENSHMEM` set to true on all machines; if you're building with `make`, you will also need to specify an OpenSHMEM-capable compiler (when building with CMake, this is configured automatically):
+
+- Using `make`:
+
+  ```bash
+  make CC=oshcc CXX=oshc++ LLAMA_MPI=1
+  ```
+
+- Using `CMake`:
+
+  ```bash
+  cmake -S . -B build -DLLAMA_MPI=ON -DCMAKE_C_COMPILER=oshcc -DCMAKE_CXX_COMPILER=oshc++
+  ```
+
+If you have access to a distributed file system (NFS) it's suggested you copy the programs and weights onto the distributed file system.
+
+Additionally, if you have a cluster with a bulk-synchronous scheduler ie: (Slurm)[https://slurm.schedmd.com] all you need to do is run the program from the distributed file system using the bulk-synchronous scheduler. The following example assumes a slurm cluster. The example additionally assumes  an NFS installation wth the distributed file system mounted with the following path on all machines: `/nfs_path`.
+
+```
+srun -n 2 /nfs_path/main -m /nfs_path/models/7B/ggml-model-q4_0.gguf -n 128
+```
+
+If you do not have access to a cluster with a bulk-synchronous scheduler or a distributed file system, the following instructions will help you stage an installation and run the application. Build the programs, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
+
+Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
+
+Here is an example hostfile:
+
+```
+192.168.0.1:1
+malvolio.local:1
+```
+
+The above will distribute the computation across 1 process on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive. It is a requirement of OpenSHMEM that the distributed job be performed over a number of machines that is equal to a power of 2.
+
+Finally, you're ready to run a computation using `oshrun`:
+
+```bash
+oshrun -hostfile hostfile -n 2 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
+```
+
 ### BLAS Build
 
 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:

From 6aad7af26d71ffd823b0255ab795777096ee63ba Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Wed, 20 Dec 2023 23:56:45 -0500
Subject: [PATCH 04/16] improved README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a3ad2530f..716c4e44a 100644
--- a/README.md
+++ b/README.md
@@ -337,7 +337,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 
 ### OpenSHMEM Build
 
-OpenSHMEM lets you distribute the computation over a cluster of machines using a Partitioned Global Address Space (PGAS). Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
+OpenSHMEM lets you distribute the computation over a cluster of machines using a Partitioned Global Address Space (PGAS). OpenSHMEM is a single-sided communication model that tends to yield improved performance for certain applications. LLM prediction is an inherently serial process. This means using OpenSHMEM will not yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
 
 First you will need the OpenSHMEM libraries installed on your system. There are 3 options: [OpenMPI's OpenSHMEM](https://www.open-mpi.org), [OSSS-OpenSHMEM](https://github.com/openshmem-org/osss-ucx) and [Sandia-OpenSHMEM](https://github.com/Sandia-OpenSHMEM/SOS). OSSS-OpenSHMEM has a dependency on the [UCX](https://github.com/openucx/ucx) communication library. Sandia-OpenSHMEM can run over udp, [UCX](https://github.com/openucx/ucx), or [libfabric](https://github.com/ofiwg/libfabric). OpenMPI's OpenSHMEM can be installed with a package manager (apt, homebrew, etc). UCX, OSSS-OpenSHMEM, and Sandia-OpenSHMEM can all be installed from source.
 
@@ -355,7 +355,7 @@ Next you will need to build the project with `LLAMA_OPENSHMEM` set to true on al
   cmake -S . -B build -DLLAMA_MPI=ON -DCMAKE_C_COMPILER=oshcc -DCMAKE_CXX_COMPILER=oshc++
   ```
 
-If you have access to a distributed file system (NFS) it's suggested you copy the programs and weights onto the distributed file system.
+If you have access to a distributed file system (NFS) it's suggested you copy the programs and weights onto the distributed file system. This cluster configration is strongly encouraged.
 
 Additionally, if you have a cluster with a bulk-synchronous scheduler ie: (Slurm)[https://slurm.schedmd.com] all you need to do is run the program from the distributed file system using the bulk-synchronous scheduler. The following example assumes a slurm cluster. The example additionally assumes  an NFS installation wth the distributed file system mounted with the following path on all machines: `/nfs_path`.
 

From ea1331a221fb79058f61bd596627a4e8b9b518ab Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 21 Dec 2023 00:02:44 -0500
Subject: [PATCH 05/16] added comment

---
 ggml-oshmem.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index ff8477141..64c1b8ce4 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -103,6 +103,12 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
     return -1;
 }
 
+/*
+ * The OpenSHMEM mechanism used in this application reflects a message passing model; this is a byproduct of OpenSHMEM's symmetric memory requirements.
+ * Care has been taken to limit the number of branches made in send/recv and the amount of two-sided communication. Memory consistency may be an issue
+ * which is why a `shmem_fence` is placed at the end of both send/recv.
+ *
+ */
 static void ggml_openshmem_tensor_send(struct ggml_openshmem_context * ctx, struct ggml_tensor * t, int dst_pe) {
 
     const int64_t symmetric_comm_structure_size =

From 0de3b02353b3a7a80bc400220fff6462f071a001 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 21 Dec 2023 10:27:46 -0500
Subject: [PATCH 06/16] updated README.md, fixed small documentation issues;
 modified a variable name

---
 README.md     | 10 ++++------
 ggml-oshmem.c | 18 +++++++++---------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 716c4e44a..8dbacbbac 100644
--- a/README.md
+++ b/README.md
@@ -337,7 +337,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 
 ### OpenSHMEM Build
 
-OpenSHMEM lets you distribute the computation over a cluster of machines using a Partitioned Global Address Space (PGAS). OpenSHMEM is a single-sided communication model that tends to yield improved performance for certain applications. LLM prediction is an inherently serial process. This means using OpenSHMEM will not yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
+OpenSHMEM lets you distribute a computation over a cluster of machines using a Partitioned Global Address Space (PGAS). OpenSHMEM's cluster abstraction is the Parallel-Random-Access-Machine (PRAM). OpenSHMEM's status as a PRAM abstraction means applications are written using the Single-Program-Many-Data (SPMD) style. OpenSHMEM is a shared memory machine abstraction for a cluster. The shared-memory machine abstraction means distributed communications operate like memory copies (memcpy). The receiver does not get a "notification" that communication events have occurred. Senders and receivers can "put" and "get" to remote memory at will. OpenSHMEM is a single-sided communication model that tends to yield improved performance for certain applications. The caveat to that statement is the underlying hardware and software layers. OpenSHMEM operates best when the communication protocol is "fire and forget" (similar to UDP). OpenSHMEM operates best on systems with remote-direct-memory-access (RDMA) enabled network-interface-cards (NICs). OpenSHMEM can work over a commodity ethernet cluster. OpenSHMEM can work on a single machine using a shared memory backend. llama.cpp's OpenSHMEM backend is designed for cluster environments. LLM inference is an inherently serial process. Using OpenSHMEM will not yield any significant [strong scaling](https://hpc-wiki.info/hpc/Scaling#Strong_or_Weak_Scaling) effects. OpenSHMEM will let you run larger models (over a cluster) than would otherwise fit into the memory (RAM) of a single machine.
 
 First you will need the OpenSHMEM libraries installed on your system. There are 3 options: [OpenMPI's OpenSHMEM](https://www.open-mpi.org), [OSSS-OpenSHMEM](https://github.com/openshmem-org/osss-ucx) and [Sandia-OpenSHMEM](https://github.com/Sandia-OpenSHMEM/SOS). OSSS-OpenSHMEM has a dependency on the [UCX](https://github.com/openucx/ucx) communication library. Sandia-OpenSHMEM can run over udp, [UCX](https://github.com/openucx/ucx), or [libfabric](https://github.com/ofiwg/libfabric). OpenMPI's OpenSHMEM can be installed with a package manager (apt, homebrew, etc). UCX, OSSS-OpenSHMEM, and Sandia-OpenSHMEM can all be installed from source.
 
@@ -346,18 +346,16 @@ Next you will need to build the project with `LLAMA_OPENSHMEM` set to true on al
 - Using `make`:
 
   ```bash
-  make CC=oshcc CXX=oshc++ LLAMA_MPI=1
+  make CC=oshcc CXX=oshc++ LLAMA_OPENSHMEM=1
   ```
 
 - Using `CMake`:
 
   ```bash
-  cmake -S . -B build -DLLAMA_MPI=ON -DCMAKE_C_COMPILER=oshcc -DCMAKE_CXX_COMPILER=oshc++
+  cmake -S . -B build -DCMAKE_C_COMPILER=oshcc -DCMAKE_CXX_COMPILER=oshc++ -DLLAMA_OPENSHMEM=ON
   ```
 
-If you have access to a distributed file system (NFS) it's suggested you copy the programs and weights onto the distributed file system. This cluster configration is strongly encouraged.
-
-Additionally, if you have a cluster with a bulk-synchronous scheduler ie: (Slurm)[https://slurm.schedmd.com] all you need to do is run the program from the distributed file system using the bulk-synchronous scheduler. The following example assumes a slurm cluster. The example additionally assumes  an NFS installation wth the distributed file system mounted with the following path on all machines: `/nfs_path`.
+It's strongly encouraged that users exercise this backend over a cluster that is configured to operate like a parallel machine. This means users should consider installing and configuring a distributed file system (NFS). Users are also encouraged to install a bulk-synchronous scheduler (e.g., [Slurm](https://slurm.schedmd.com)). Typical parallel machine configurations usually have 2 networks, a network for slurm/NFS and a separate network for compute. This may not be practical for most users. After compiling llama.cpp w/OpenSHMEM, users will just need to copy the programs and weights onto the distributed file system. In order to run llama.cpp w/OpenSHMEM a user will need to run the program from the distributed file system using a bulk-synchronous scheduler. The following example assumes a slurm cluster is set up and configured. The example assumes an NFS installation is set up, configured, and mounted on each machine with the following path: `/nfs_path`.
 
 ```
 srun -n 2 /nfs_path/main -m /nfs_path/models/7B/ggml-model-q4_0.gguf -n 128
diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index 64c1b8ce4..c49250e02 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -243,7 +243,7 @@ void ggml_openshmem_graph_compute_pre(
         struct ggml_openshmem_context * ctx_openshmem,
              struct ggml_cgraph * gf,
                             int   n_layers) {
-    const int openshmem_rank = ctx_openshmem->pe;
+    const int openshmem_pe = ctx_openshmem->pe;
     const int openshmem_size = ctx_openshmem->n_pes;
 
     struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
@@ -274,8 +274,8 @@ void ggml_openshmem_graph_compute_pre(
     {
         struct ggml_tensor * input_tokens[2] = { inp_tokens, inp0 };
 
-        if (openshmem_rank > 0) {
-            ggml_openshmem_tensor_recv(ctx_openshmem, input_tokens[openshmem_rank == 1], openshmem_rank-1);
+        if (openshmem_pe > 0) {
+            ggml_openshmem_tensor_recv(ctx_openshmem, input_tokens[openshmem_pe == 1], openshmem_pe-1);
         }
         else if (openshmem_size > 1) {
             // node 0 sends the input tokens to node 1
@@ -289,7 +289,7 @@ void ggml_openshmem_graph_compute_pre(
     {
         const int n_per_node = (n_layers + (openshmem_size - 1)) / openshmem_size;
 
-        const int openshmem_idx = openshmem_rank > 0 ? openshmem_rank - 1 : openshmem_size - 1;
+        const int openshmem_idx = openshmem_pe > 0 ? openshmem_pe - 1 : openshmem_size - 1;
 
         const int il0 =               (openshmem_idx + 0) * n_per_node;
         const int il1 = MIN(n_layers, (openshmem_idx + 1) * n_per_node);
@@ -301,7 +301,7 @@ void ggml_openshmem_graph_compute_pre(
         snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
 
         const int idx_l0 =                ggml_graph_get_node_idx(gf, name_l0);
-        const int idx_l1 = openshmem_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
+        const int idx_l1 = openshmem_pe > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
 
         if (idx_l0 < 0 || idx_l1 < 0) {
             fprintf(stderr, "%s: layer input nodes not found\n", __func__);
@@ -332,7 +332,7 @@ void ggml_openshmem_graph_compute_pre(
 
         gf->n_nodes = idx_l1 - idx_l0;
 
-        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, openshmem_rank, gf->n_nodes, il0, il1);
+        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, openshmem_pe, gf->n_nodes, il0, il1);
     }
 }
 
@@ -342,11 +342,11 @@ void ggml_openshmem_graph_compute_post(
                             int   n_layers) {
     UNUSED(n_layers);
 
-    const int openshmem_rank = ctx_openshmem->pe;
+    const int openshmem_pe = ctx_openshmem->pe;
     const int openshmem_size = ctx_openshmem->n_pes;
 
     // send the output data to the next node
-    if (openshmem_rank > 0) {
-        ggml_openshmem_tensor_send(ctx_openshmem, gf->nodes[gf->n_nodes - 1], (openshmem_rank + 1) % openshmem_size);
+    if (openshmem_pe > 0) {
+        ggml_openshmem_tensor_send(ctx_openshmem, gf->nodes[gf->n_nodes - 1], (openshmem_pe + 1) % openshmem_size);
     }
 }

From fa49c150d0e6b9968f39c0087667b7c243711ba7 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 21 Dec 2023 10:52:59 -0500
Subject: [PATCH 07/16] fixed small segmentation bug; switched to using
 type-sensitive openshmem calls

---
 ggml-oshmem.c | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index c49250e02..4c4923979 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -70,18 +70,29 @@ int ggml_openshmem_pe(struct ggml_openshmem_context * ctx) {
 }
 
 void ggml_openshmem_eval_init(
-        struct ggml_openshmem_context * ctx_openshmem,
+        struct ggml_openshmem_context * ctx,
         int * n_tokens,
         int * n_past,
         int * n_threads) {
-    UNUSED(ctx_openshmem);
+    UNUSED(ctx);
+
+    uint8_t * dst_symmetric_comm_structure =
+        ((uint8_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*ctx->pe);
+    int64_t * dst_symmetric_comm_offset =
+        (int64_t*)(dst_symmetric_comm_structure);
 
     // synchronize the worker node parameters with the root node
     shmem_barrier_all();
 
-    shmem_broadcast(SHMEM_TEAM_WORLD, n_tokens, n_tokens, 1, 0);
-    shmem_broadcast(SHMEM_TEAM_WORLD, n_past, n_tokens, 1, 0);
-    shmem_broadcast(SHMEM_TEAM_WORLD, n_threads, n_tokens, 1, 0);
+    memcpy(dst_symmetric_comm_offset, n_tokens, sizeof(int));
+    memcpy(dst_symmetric_comm_offset+sizeof(int), n_past, sizeof(int));
+    memcpy(dst_symmetric_comm_offset+sizeof(int)+sizeof(int), n_past, sizeof(int));
+
+    shmem_int32_broadcast(SHMEM_TEAM_WORLD, (int*)dst_symmetric_comm_offset, (int*)dst_symmetric_comm_offset, 3, 0);
+
+    memcpy(n_tokens, dst_symmetric_comm_offset, sizeof(int));
+    memcpy(n_past, dst_symmetric_comm_offset+sizeof(int), sizeof(int));
+    memcpy(n_threads, dst_symmetric_comm_offset+sizeof(int)+sizeof(int), sizeof(int));
 
     shmem_quiet();
 }
@@ -139,16 +150,16 @@ static void ggml_openshmem_tensor_send(struct ggml_openshmem_context * ctx, stru
         default: GGML_ASSERT(false && "not implemented");
     }
 
-    int64_t count[2] = { (xmt_size / OPENSHMEM_SYMMETRIC_BUFFER_SIZE), 1 };
-    const int64_t total_loop_count = count[ count[0] == 0 ];
-
-    int64_t xmt_amount [2] = { OPENSHMEM_SYMMETRIC_BUFFER_SIZE, xmt_size - (OPENSHMEM_SYMMETRIC_BUFFER_SIZE * count[0]) };
+    int64_t init_segments = (xmt_size / OPENSHMEM_SYMMETRIC_BUFFER_SIZE);
+    int64_t xmt_amount [2] = { OPENSHMEM_SYMMETRIC_BUFFER_SIZE, xmt_size - (OPENSHMEM_SYMMETRIC_BUFFER_SIZE * init_segments) };
     int64_t xmt_byte_offset = 0;
     int64_t xmt_byte_amount = 0;
  
+    const int64_t total_loop_count = init_segments + !( xmt_amount[1] < 1);
+
     memcpy(dst_symmetric_comm_offset, &total_loop_count, sizeof(int64_t));
 
-    shmem_put_signal(
+    shmem_int64_put_signal(
         dst_symmetric_comm_offset,
         dst_symmetric_comm_offset,
         sizeof(int64_t),
@@ -173,7 +184,7 @@ static void ggml_openshmem_tensor_send(struct ggml_openshmem_context * ctx, stru
         memcpy(dst_symmetric_comm_length, &xmt_byte_amount, sizeof(int64_t)); 
         memcpy(dst_symmetric_comm_buffer, ((uint8_t*)t->data)+xmt_byte_offset, xmt_byte_amount); 
 
-        shmem_put_signal(
+        shmem_uint8_put_signal(
             dst_symmetric_comm_structure,
             dst_symmetric_comm_structure,
             symmetric_comm_structure_size,
@@ -220,7 +231,7 @@ static void ggml_openshmem_tensor_recv(struct ggml_openshmem_context * ctx, stru
     (*my_recv_signal) = 0;
 
     memcpy(src_symmetric_comm_offset, &total_loop_count, sizeof(int64_t));
-    shmem_put_signal(src_symmetric_comm_structure, src_symmetric_comm_structure, 0, src_recv_signal, 1, SHMEM_SIGNAL_SET, src_pe);
+    shmem_uint8_put_signal(src_symmetric_comm_structure, src_symmetric_comm_structure, 0, src_recv_signal, 1, SHMEM_SIGNAL_SET, src_pe);
 
     for(int32_t i = 0; i < total_loop_count; ++i) {
         shmem_wait_until(my_recv_signal, SHMEM_CMP_EQ, 1);
@@ -232,7 +243,7 @@ static void ggml_openshmem_tensor_recv(struct ggml_openshmem_context * ctx, stru
             (*src_symmetric_comm_length)
         );
 
-        shmem_put_signal(src_symmetric_comm_structure, src_symmetric_comm_structure, 0, src_recv_signal, 1, SHMEM_SIGNAL_SET, src_pe);
+        shmem_uint8_put_signal(src_symmetric_comm_structure, src_symmetric_comm_structure, 0, src_recv_signal, 1, SHMEM_SIGNAL_SET, src_pe);
     }
 
     shmem_fence();

From 6d08baccea364a92231e2879b410254e1303d662 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 21 Dec 2023 11:05:19 -0500
Subject: [PATCH 08/16] added explicit casting; fixed small memcpy issue

---
 ggml-oshmem.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index 4c4923979..594ac6b4a 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -84,15 +84,15 @@ void ggml_openshmem_eval_init(
     // synchronize the worker node parameters with the root node
     shmem_barrier_all();
 
-    memcpy(dst_symmetric_comm_offset, n_tokens, sizeof(int));
-    memcpy(dst_symmetric_comm_offset+sizeof(int), n_past, sizeof(int));
-    memcpy(dst_symmetric_comm_offset+sizeof(int)+sizeof(int), n_past, sizeof(int));
+    memcpy((int*)dst_symmetric_comm_offset, n_tokens, sizeof(int));
+    memcpy(((int*)dst_symmetric_comm_offset)+1, n_past, sizeof(int));
+    memcpy(((int*)dst_symmetric_comm_offset)+2, n_threads, sizeof(int));
 
     shmem_int32_broadcast(SHMEM_TEAM_WORLD, (int*)dst_symmetric_comm_offset, (int*)dst_symmetric_comm_offset, 3, 0);
 
-    memcpy(n_tokens, dst_symmetric_comm_offset, sizeof(int));
-    memcpy(n_past, dst_symmetric_comm_offset+sizeof(int), sizeof(int));
-    memcpy(n_threads, dst_symmetric_comm_offset+sizeof(int)+sizeof(int), sizeof(int));
+    memcpy(n_tokens, ((int*)dst_symmetric_comm_offset), sizeof(int));
+    memcpy(n_past, ((int*)dst_symmetric_comm_offset)+1, sizeof(int));
+    memcpy(n_threads, ((int*)dst_symmetric_comm_offset)+2, sizeof(int));
 
     shmem_quiet();
 }

From ecf9c7983c593e49cdb72037771924f6693853c1 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 21 Dec 2023 11:10:28 -0500
Subject: [PATCH 09/16] did some formatting

---
 ggml-oshmem.c | 46 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index 594ac6b4a..92e93e61a 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -151,13 +151,21 @@ static void ggml_openshmem_tensor_send(struct ggml_openshmem_context * ctx, stru
     }
 
     int64_t init_segments = (xmt_size / OPENSHMEM_SYMMETRIC_BUFFER_SIZE);
-    int64_t xmt_amount [2] = { OPENSHMEM_SYMMETRIC_BUFFER_SIZE, xmt_size - (OPENSHMEM_SYMMETRIC_BUFFER_SIZE * init_segments) };
+    int64_t xmt_amount [2] = {
+        OPENSHMEM_SYMMETRIC_BUFFER_SIZE,
+        xmt_size - (OPENSHMEM_SYMMETRIC_BUFFER_SIZE * init_segments)
+    };
     int64_t xmt_byte_offset = 0;
     int64_t xmt_byte_amount = 0;
  
-    const int64_t total_loop_count = init_segments + !( xmt_amount[1] < 1);
+    const int64_t total_loop_count =
+        init_segments + !( xmt_amount[1] < 1);
 
-    memcpy(dst_symmetric_comm_offset, &total_loop_count, sizeof(int64_t));
+    memcpy(
+        dst_symmetric_comm_offset,
+        &total_loop_count,
+        sizeof(int64_t)
+    );
 
     shmem_int64_put_signal(
         dst_symmetric_comm_offset,
@@ -227,14 +235,30 @@ static void ggml_openshmem_tensor_recv(struct ggml_openshmem_context * ctx, stru
 
     int64_t total_loop_count = 0;
 
-    shmem_wait_until(my_recv_signal, SHMEM_CMP_EQ, 1);
+    shmem_wait_until(
+        my_recv_signal,
+        SHMEM_CMP_EQ,
+        1
+    );
     (*my_recv_signal) = 0;
 
     memcpy(src_symmetric_comm_offset, &total_loop_count, sizeof(int64_t));
-    shmem_uint8_put_signal(src_symmetric_comm_structure, src_symmetric_comm_structure, 0, src_recv_signal, 1, SHMEM_SIGNAL_SET, src_pe);
+    shmem_uint8_put_signal(
+        src_symmetric_comm_structure,
+        src_symmetric_comm_structure,
+        0,
+        src_recv_signal,
+        1,
+        SHMEM_SIGNAL_SET,
+        src_pe
+    );
 
     for(int32_t i = 0; i < total_loop_count; ++i) {
-        shmem_wait_until(my_recv_signal, SHMEM_CMP_EQ, 1);
+        shmem_wait_until(
+            my_recv_signal,
+            SHMEM_CMP_EQ,
+            1
+        );
         (*my_recv_signal) = 0;
 
         memcpy(
@@ -243,7 +267,15 @@ static void ggml_openshmem_tensor_recv(struct ggml_openshmem_context * ctx, stru
             (*src_symmetric_comm_length)
         );
 
-        shmem_uint8_put_signal(src_symmetric_comm_structure, src_symmetric_comm_structure, 0, src_recv_signal, 1, SHMEM_SIGNAL_SET, src_pe);
+        shmem_uint8_put_signal(
+            src_symmetric_comm_structure,
+            src_symmetric_comm_structure,
+            0,
+            src_recv_signal,
+            1,
+            SHMEM_SIGNAL_SET,
+            src_pe
+        );
     }
 
     shmem_fence();

From 46bcbf380545e5075586b772ad3ebc86303c9e5e Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 21 Dec 2023 11:17:13 -0500
Subject: [PATCH 10/16] fixed formatting

---
 ggml-oshmem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index 92e93e61a..0ff2885ae 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -284,8 +284,8 @@ static void ggml_openshmem_tensor_recv(struct ggml_openshmem_context * ctx, stru
 // TODO: there are many improvements that can be done to this implementation
 void ggml_openshmem_graph_compute_pre(
         struct ggml_openshmem_context * ctx_openshmem,
-             struct ggml_cgraph * gf,
-                            int   n_layers) {
+        struct ggml_cgraph * gf,
+        int   n_layers) {
     const int openshmem_pe = ctx_openshmem->pe;
     const int openshmem_size = ctx_openshmem->n_pes;
 

From 3f2769bf261b6b9b5f36f9a180546d3a3a4ee2c1 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 21 Dec 2023 18:25:22 -0500
Subject: [PATCH 11/16] added correct use of shmem_free

---
 ggml-oshmem.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index 0ff2885ae..86e759693 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -32,7 +32,8 @@ void ggml_openshmem_backend_free(void) {
 }
 
 struct ggml_openshmem_context * ggml_openshmem_init(void) {
-    struct ggml_openshmem_context * ctx = calloc(1, sizeof(struct ggml_openshmem_context));
+    struct ggml_openshmem_context * ctx =
+        (struct ggml_openshmem_context *)calloc(1, sizeof(struct ggml_openshmem_context));
 
     ctx->pe = shmem_my_pe(); 
     ctx->n_pes = shmem_n_pes();
@@ -62,6 +63,8 @@ struct ggml_openshmem_context * ggml_openshmem_init(void) {
 }
 
 void ggml_openshmem_free(struct ggml_openshmem_context * ctx) {
+    shmem_free(ctx->symmetric_comm_structure);
+    shmem_free(ctx->recv_signal);
     free(ctx);
 }
 

From c8d67705feed1ecdb64ffad605ab7f5bbb914525 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 21 Dec 2023 18:58:46 -0500
Subject: [PATCH 12/16] reduced the number of shmem_calloc calls

---
 ggml-oshmem.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index 86e759693..0cdf4bdc5 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -51,20 +51,14 @@ struct ggml_openshmem_context * ggml_openshmem_init(void) {
      *
      */
     ctx->symmetric_buffer_size = OPENSHMEM_SYMMETRIC_BUFFER_SIZE;
-    ctx->symmetric_comm_structure_size = OPENSHMEM_SYMMETRIC_BUFFER_SIZE + sizeof(int64_t) + sizeof(int64_t);
+    ctx->symmetric_comm_structure_size = OPENSHMEM_SYMMETRIC_BUFFER_SIZE + sizeof(int64_t) + sizeof(int64_t) + sizeof(uint64_t) + sizeof(uint64_t);
     ctx->symmetric_comm_structure = (uint8_t*)shmem_calloc(1, ctx->n_pes*ctx->symmetric_comm_structure_size);
 
-    /*
-     * uint8_t signal_byte[shmem_npes()];
-     */
-    ctx->recv_signal = (uint64_t*)shmem_calloc(1, ctx->n_pes*sizeof(uint64_t));
-
     return ctx;
 }
 
 void ggml_openshmem_free(struct ggml_openshmem_context * ctx) {
     shmem_free(ctx->symmetric_comm_structure);
-    shmem_free(ctx->recv_signal);
     free(ctx);
 }
 
@@ -127,18 +121,20 @@ static void ggml_openshmem_tensor_send(struct ggml_openshmem_context * ctx, stru
 
     const int64_t symmetric_comm_structure_size =
         ctx->symmetric_comm_structure_size;
+
+    uint64_t * my_recv_signal =
+        ((uint64_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*ctx->pe);
+    uint64_t * dst_recv_signal =
+        ((uint64_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*ctx->pe)+sizeof(uint64_t);
+
     uint8_t * dst_symmetric_comm_structure =
-        ((uint8_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*ctx->pe);
+        ((uint8_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*ctx->pe)+sizeof(uint64_t)+sizeof(uint64_t);
     int64_t * dst_symmetric_comm_offset =
         (int64_t*)(dst_symmetric_comm_structure);
     int64_t * dst_symmetric_comm_length =
         ((int64_t*)dst_symmetric_comm_offset)+sizeof(int64_t);
     uint8_t * dst_symmetric_comm_buffer =
         ((uint8_t*)dst_symmetric_comm_length)+sizeof(int64_t);
-    uint64_t * dst_recv_signal =
-        ctx->recv_signal+dst_pe;
-    uint64_t * my_recv_signal =
-        ctx->recv_signal+ctx->pe;
 
     const int64_t nelements = ggml_nelements(t);
     int64_t xmt_size = 0;
@@ -223,18 +219,22 @@ static void ggml_openshmem_tensor_send(struct ggml_openshmem_context * ctx, stru
 
 static void ggml_openshmem_tensor_recv(struct ggml_openshmem_context * ctx, struct ggml_tensor * t, int src_pe) {
 
+    const int64_t symmetric_comm_structure_size =
+        ctx->symmetric_comm_structure_size;
+
+    uint64_t * src_recv_signal =
+        ((uint64_t*)ctx->symmetric_comm_structure)+(symmetric_comm_structure_size*src_pe);
+    uint64_t * my_recv_signal =
+        ((uint64_t*)ctx->symmetric_comm_structure)+(symmetric_comm_structure_size*src_pe)+sizeof(uint64_t);
+
     uint8_t * src_symmetric_comm_structure =
-        ((uint8_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*src_pe);
+        ((uint8_t*)ctx->symmetric_comm_structure)+(symmetric_comm_structure_size*src_pe)+sizeof(uint64_t)+sizeof(uint64_t);
     int64_t * src_symmetric_comm_offset =
         (int64_t*)(src_symmetric_comm_structure);
     int64_t * src_symmetric_comm_length =
         ((int64_t*)src_symmetric_comm_offset)+sizeof(int64_t);
     uint8_t * src_symmetric_comm_buffer =
         ((uint8_t*)src_symmetric_comm_length)+sizeof(int64_t);
-    uint64_t * src_recv_signal =
-        ctx->recv_signal+src_pe;
-    uint64_t * my_recv_signal =
-        ctx->recv_signal+ctx->pe;
 
     int64_t total_loop_count = 0;
 

From d05fcad5d12bc76013411efc2da721cd26a374a8 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 21 Dec 2023 19:35:33 -0500
Subject: [PATCH 13/16] cleaned up pointer arithmetic; rm'd a member variable
 of the oshmem context struct

---
 ggml-oshmem.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index 0cdf4bdc5..e9385edef 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -20,7 +20,6 @@ struct ggml_openshmem_context {
     int64_t symmetric_buffer_size;
     int64_t symmetric_comm_structure_size;
     uint8_t * symmetric_comm_structure;
-    uint64_t * recv_signal;
 };
 
 void ggml_openshmem_backend_init(void) {
@@ -74,7 +73,7 @@ void ggml_openshmem_eval_init(
     UNUSED(ctx);
 
     uint8_t * dst_symmetric_comm_structure =
-        ((uint8_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*ctx->pe);
+        ((uint8_t*)ctx->symmetric_comm_structure)+sizeof(uint64_t)+sizeof(uint64_t);
     int64_t * dst_symmetric_comm_offset =
         (int64_t*)(dst_symmetric_comm_structure);
 
@@ -123,12 +122,12 @@ static void ggml_openshmem_tensor_send(struct ggml_openshmem_context * ctx, stru
         ctx->symmetric_comm_structure_size;
 
     uint64_t * my_recv_signal =
-        ((uint64_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*ctx->pe);
+        ((uint64_t*)ctx->symmetric_comm_structure)+(symmetric_comm_structure_size*ctx->pe);
     uint64_t * dst_recv_signal =
-        ((uint64_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*ctx->pe)+sizeof(uint64_t);
+        ((uint64_t*)my_recv_signal)+sizeof(uint64_t);
 
     uint8_t * dst_symmetric_comm_structure =
-        ((uint8_t*)ctx->symmetric_comm_structure)+(ctx->symmetric_comm_structure_size*ctx->pe)+sizeof(uint64_t)+sizeof(uint64_t);
+        ((uint8_t*)dst_recv_signal)+sizeof(uint64_t);
     int64_t * dst_symmetric_comm_offset =
         (int64_t*)(dst_symmetric_comm_structure);
     int64_t * dst_symmetric_comm_length =
@@ -225,10 +224,10 @@ static void ggml_openshmem_tensor_recv(struct ggml_openshmem_context * ctx, stru
     uint64_t * src_recv_signal =
         ((uint64_t*)ctx->symmetric_comm_structure)+(symmetric_comm_structure_size*src_pe);
     uint64_t * my_recv_signal =
-        ((uint64_t*)ctx->symmetric_comm_structure)+(symmetric_comm_structure_size*src_pe)+sizeof(uint64_t);
+        ((uint64_t*)src_recv_signal)+sizeof(uint64_t);
 
     uint8_t * src_symmetric_comm_structure =
-        ((uint8_t*)ctx->symmetric_comm_structure)+(symmetric_comm_structure_size*src_pe)+sizeof(uint64_t)+sizeof(uint64_t);
+        ((uint8_t*)my_recv_signal)+sizeof(uint64_t);
     int64_t * src_symmetric_comm_offset =
         (int64_t*)(src_symmetric_comm_structure);
     int64_t * src_symmetric_comm_length =

From eb0f775950a064f9ebe184b2d312b57f50931eb1 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Thu, 21 Dec 2023 21:20:46 -0500
Subject: [PATCH 14/16] cleaned up pointer arithmetic

---
 ggml-oshmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index e9385edef..a3d363937 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -244,7 +244,7 @@ static void ggml_openshmem_tensor_recv(struct ggml_openshmem_context * ctx, stru
     );
     (*my_recv_signal) = 0;
 
-    memcpy(src_symmetric_comm_offset, &total_loop_count, sizeof(int64_t));
+    memcpy(&total_loop_count, src_symmetric_comm_offset, sizeof(int64_t));
     shmem_uint8_put_signal(
         src_symmetric_comm_structure,
         src_symmetric_comm_structure,

From 71f4c9633134b6d46e8a85bb202274bb27483799 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Fri, 22 Dec 2023 21:07:26 -0500
Subject: [PATCH 15/16] added oshmem backend to llama.cpp

---
 llama.cpp | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 46318bed3..8e268fa18 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1549,6 +1549,11 @@ struct llama_context {
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
+
+#ifdef GGML_USE_OPENSHMEM
+    ggml_openshmem_context * ctx_oshmem = NULL;
+#endif
+
 };
 
 //
@@ -6352,6 +6357,12 @@ static int llama_decode_internal(
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
+#ifdef GGML_USE_OPENSHMEM
+    const int64_t n_layer = hparams.n_layer;
+    ggml_openshmem_graph_compute_pre(lctx.ctx_oshmem, gf, n_layer);
+#endif
+
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal) {
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
@@ -6367,6 +6378,10 @@ static int llama_decode_internal(
     ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif
 
+#ifdef GGML_USE_OPENSHMEM
+    ggml_openshmem_graph_compute_post(lctx.ctx_oshmem, gf, n_layer);
+#endif
+
     // update the kv ring buffer
     {
         if (kv_self.has_shift) {
@@ -9256,12 +9271,21 @@ void llama_backend_init(bool numa) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_init();
 #endif
+
+#ifdef GGML_USE_OPENSHMEM
+    ggml_openshmem_backend_init();
+#endif
+
 }
 
 void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
+#ifdef GGML_USE_OPENSHMEM
+    ggml_openshmem_backend_free();
+#endif
+
 }
 
 int64_t llama_time_us(void) {
@@ -9524,6 +9548,20 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif
 
+#ifdef GGML_USE_OPENSHMEM
+    ctx->ctx_oshmem = ggml_openshmem_init();
+
+    if (ggml_openshmem_pe(ctx->ctx_oshmem) > 0) {
+        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+        // TODO: needs fix after #3228
+        GGML_ASSERT(false && "not implemented");
+        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
+        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+        llama_backend_free();
+        exit(1);
+    }
+#endif
+
     return ctx;
 }
 

From d3f155733f98770c38e72fbab0d2a6caa5cd0186 Mon Sep 17 00:00:00 2001
From: ct-clmsn <ct.clmsn@gmail.com>
Date: Sat, 23 Dec 2023 20:49:12 -0500
Subject: [PATCH 16/16] updated thread support

---
 ggml-oshmem.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml-oshmem.c b/ggml-oshmem.c
index a3d363937..1a1b85dea 100644
--- a/ggml-oshmem.c
+++ b/ggml-oshmem.c
@@ -23,7 +23,8 @@ struct ggml_openshmem_context {
 };
 
 void ggml_openshmem_backend_init(void) {
-    shmem_init();
+    int provided = 0;
+    shmem_init_thread(SHMEM_THREAD_MULTIPLE, &provided);
 }
 
 void ggml_openshmem_backend_free(void) {