From 1262e7ed13ac197c944f15e1ddb083cb4f36cf65 Mon Sep 17 00:00:00 2001
From: DavidKorczynski
Date: Mon, 12 Aug 2024 13:36:41 +0100
Subject: [PATCH 1/5] grammar-parser : fix possible null-deref (#9004)

Fixes: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=70680

Signed-off-by: David Korczynski
---
 common/grammar-parser.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index a518b766d..438452eab 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -369,6 +369,9 @@ namespace grammar_parser {
             }
             // Validate the state to ensure that all rules are defined
             for (const auto & rule : state.rules) {
+                if (rule.empty()) {
+                    throw std::runtime_error("Undefined rule");
+                }
                 for (const auto & elem : rule) {
                     if (elem.type == LLAMA_GRETYPE_RULE_REF) {
                         // Ensure that the rule at that location exists
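For illustration only, a minimal sketch of how the added validation surfaces to callers. It assumes the common/grammar-parser.h API of this period, where grammar_parser::parse() catches the thrown exception and returns an empty parse_state; the grammar string is a hypothetical stand-in, not the original fuzzer input.

```cpp
// Sketch under the assumptions stated above; not the original reproducer.
#include "grammar-parser.h"

#include <cstdio>

int main() {
    // Hypothetical malformed grammar: the exact input behind the OSS-Fuzz
    // report is not reproduced here.
    const char * src = "root ::= item+";

    grammar_parser::parse_state parsed = grammar_parser::parse(src);

    // parse() validates the rule set (now including the empty-rule check above)
    // and returns an empty state on failure, so callers should bail out rather
    // than carry a broken rule table into sampling.
    if (parsed.rules.empty()) {
        fprintf(stderr, "grammar rejected\n");
        return 1;
    }
    return 0;
}
```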
From 84eb2f4fad28ceadd415a4e775320c983f4d9a7d Mon Sep 17 00:00:00 2001
From: Frank Mai
Date: Mon, 12 Aug 2024 20:45:50 +0800
Subject: [PATCH 2/5] docs: introduce gpustack and gguf-parser (#8873)

* readme: introduce gpustack

GPUStack is an open-source GPU cluster manager for running large
language models, which uses llama.cpp as the backend.

Signed-off-by: thxCode

* readme: introduce gguf-parser

GGUF Parser is a tool to review/check the GGUF file and estimate the
memory usage without downloading the whole model.

Signed-off-by: thxCode

---------

Signed-off-by: thxCode
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 1283f6805..7f48fde6e 100644
--- a/README.md
+++ b/README.md
@@ -186,10 +186,12 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
+- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
 
 **Infrastructure:**
 
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
+- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 
 **Games:**
 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.

From 0fd93cdef5e583aa980b3c0d693c0d207f0787a7 Mon Sep 17 00:00:00 2001
From: Nico Bosshard
Date: Mon, 12 Aug 2024 17:13:59 +0200
Subject: [PATCH 3/5] llama : model-based max number of graph nodes calculation (#8970)

* llama : model-based max number of graph nodes calculation

* Update src/llama.cpp

---------

Co-authored-by: slaren
---
 src/llama.cpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index aaf8db496..7f2f00031 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3575,13 +3575,8 @@ namespace GGUFMeta {
 
 using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
 
-// TODO: update when needed or think of some clever automatic way to do this
-static size_t llama_model_max_nodes(const llama_model & /*model*/) {
-    //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
-    //    return 32768;
-    //}
-
-    return 8192;
+static size_t llama_model_max_nodes(const llama_model & model) {
+    return std::max<size_t>(8192, model.tensors_by_name.size()*5);
 }
 
 struct llama_model_loader {
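To make the new bound concrete, here is a self-contained sketch of the same heuristic with made-up tensor counts (the counts are illustrative, not measurements of real models): models with fewer than roughly 1600 tensors keep the old 8192 ceiling, while very large models get about five graph nodes per weight tensor.

```cpp
// Standalone illustration of the heuristic introduced above; the tensor
// counts below are made up for the example.
#include <algorithm>
#include <cstddef>
#include <cstdio>

static size_t max_nodes_for(size_t n_tensors) {
    // Same shape as the new llama_model_max_nodes(): never drop below the old
    // fixed budget of 8192, and scale as ~5 graph nodes per weight tensor.
    return std::max<size_t>(8192, n_tensors*5);
}

int main() {
    const size_t counts[] = {291, 1200, 50000}; // hypothetical tensor counts
    for (size_t n : counts) {
        printf("%5zu tensors -> %6zu max graph nodes\n", n, max_nodes_for(n));
    }
    return 0;
}
```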
From 1f67436c5ee6f4c99e71a8518bdfc214c27ce934 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov
Date: Mon, 12 Aug 2024 19:17:03 +0300
Subject: [PATCH 4/5] ci : enable RPC in all of the released builds (#9006)

ref: #8912
---
 .github/workflows/build.yml | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b9246659a..74b5d4f69 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -47,7 +47,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -105,7 +105,7 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -222,7 +222,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(nproc)
 
       - name: Test
@@ -696,22 +696,20 @@ jobs:
     strategy:
       matrix:
         include:
-          - build: 'rpc-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'noavx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx2-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'avx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx512-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'openblas-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'kompute-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'llvm-arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'msvc-arm64'

From fc4ca27b25464a11b3b86c9dbb5b6ed6065965c2 Mon Sep 17 00:00:00 2001
From: Diogo Teles Sant'Anna
Date: Mon, 12 Aug 2024 13:28:23 -0300
Subject: [PATCH 5/5] ci : fix github workflow vulnerable to script injection (#9008)

Signed-off-by: Diogo Teles Sant'Anna
---
 .github/workflows/bench.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index eb69b82c4..56d22bc0c 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -129,6 +129,8 @@ jobs:
 
     - name: Server bench
       id: server_bench
+      env:
+        HEAD_REF: ${{ github.head_ref || github.ref_name }}
       run: |
         set -eux
 
@@ -137,7 +139,7 @@ jobs:
         python bench.py \
             --runner-label ${{ env.RUNNER_LABEL }} \
             --name ${{ github.job }} \
-            --branch ${{ github.head_ref || github.ref_name }} \
+            --branch $HEAD_REF \
             --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
             --scenario script.js \
             --duration ${{ github.event.inputs.duration || env.DURATION }} \
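For context on patch 4: once the released builds carry GGML_RPC=ON, a typical way to exercise the backend looks roughly like the sketch below. The rpc-server binary name, its --host/--port flags, and the client-side --rpc flag are assumptions about the tooling of this period, not something stated in the patch itself.

```sh
# Rough sketch only; binary names and flags are assumptions, see note above.

# Build with the RPC backend enabled, as the updated workflows now do.
cmake -B build -DGGML_RPC=ON
cmake --build build --config Release -j "$(nproc)"

# On the machine that should host the offloaded work, start an RPC worker.
./build/bin/rpc-server --host 0.0.0.0 --port 50052

# From the client machine, point the CLI at that worker.
./build/bin/llama-cli -m model.gguf -p "Hello" -n 32 --rpc 192.168.1.10:50052
```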