From 2af23d30434a677c6416812eea52ccc0af65119c Mon Sep 17 00:00:00 2001 From: Bernat Vadell Date: Fri, 17 Mar 2023 10:47:06 +0100 Subject: [PATCH 1/8] =?UTF-8?q?=F0=9F=9A=80=20Dockerize=20llamacpp=20(#132?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: dockerize llamacpp * feat: split build & runtime stages * split dockerfile into main & tools * add quantize into tool docker image * Update .devops/tools.sh Co-authored-by: Georgi Gerganov * add docker action pipeline * change CI to publish at github docker registry * fix name runs-on macOS-latest is macos-latest (lowercase) * include docker versioned images * fix github action docker * fix docker.yml * feat: include all-in-one command tool & update readme.md --------- Co-authored-by: Georgi Gerganov --- .devops/full.Dockerfile | 17 ++++++++++ .devops/main.Dockerfile | 18 ++++++++++ .devops/tools.sh | 46 +++++++++++++++++++++++++ .dockerignore | 24 +++++++++++++ .github/workflows/build.yml | 2 +- .github/workflows/docker.yml | 61 +++++++++++++++++++++++++++++++++ README.md | 32 +++++++++++++++++ convert-pth-to-ggml.py | 6 +++- download-pth.py | 66 ++++++++++++++++++++++++++++++++++++ 9 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 .devops/full.Dockerfile create mode 100644 .devops/main.Dockerfile create mode 100755 .devops/tools.sh create mode 100644 .dockerignore create mode 100644 .github/workflows/docker.yml create mode 100644 download-pth.py diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile new file mode 100644 index 000000000..618cdddc4 --- /dev/null +++ b/.devops/full.Dockerfile @@ -0,0 +1,17 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION as build + +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip + +RUN pip install --upgrade pip setuptools wheel \ + && pip install torch torchvision torchaudio sentencepiece numpy + +WORKDIR /app + +COPY . . + +RUN make + +ENTRYPOINT ["/app/.devops/tools.sh"] \ No newline at end of file diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile new file mode 100644 index 000000000..cd575efa0 --- /dev/null +++ b/.devops/main.Dockerfile @@ -0,0 +1,18 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION as build + +RUN apt-get update && \ + apt-get install -y build-essential + +WORKDIR /app + +COPY . . + +RUN make + +FROM ubuntu:$UBUNTU_VERSION as runtime + +COPY --from=build /app/main /main + +ENTRYPOINT [ "/main" ] \ No newline at end of file diff --git a/.devops/tools.sh b/.devops/tools.sh new file mode 100755 index 000000000..b5711c94e --- /dev/null +++ b/.devops/tools.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -e + +# Read the first argument into a variable +arg1="$1" + +# Shift the arguments to remove the first one +shift + +# Join the remaining arguments into a single string +arg2="$@" + +if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then + python3 ./convert-pth-to-ggml.py $arg2 +elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then + ./quantize $arg2 +elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then + ./main $arg2 +elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then + python3 ./download-pth.py $arg2 +elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then + echo "Downloading model..." + python3 ./download-pth.py "$1" "$2" + echo "Converting PTH to GGML..." 
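+    # The loop below picks up every f16 ggml file produced for the chosen model size and
+    # quantizes it; "${i/f16/q4_0}" is bash parameter substitution that derives the q4_0
+    # output filename from the f16 input filename.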
+ for i in `ls $1/$2/ggml-model-f16.bin*`; do + if [ -f "${i/f16/q4_0}" ]; then + echo "Skip model quantization, it already exists: ${i/f16/q4_0}" + else + echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." + ./quantize "$i" "${i/f16/q4_0}" 2 + fi + done +else + echo "Unknown command: $arg1" + echo "Available commands: " + echo " --run (-r): Run a model previously converted into ggml" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512" + echo " --convert (-c): Convert a llama model into ggml" + echo " ex: \"/models/7B/\" 1" + echo " --quantize (-q): Optimize with quantization process ggml" + echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" + echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/" + echo " ex: \"/models/\" 7B" + echo " --all-in-one (-a): Execute --download, --convert & --quantize" + echo " ex: \"/models/\" 7B" +fi diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..952990f26 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,24 @@ +*.o +*.a +.cache/ +.vs/ +.vscode/ +.DS_Store + +build/ +build-em/ +build-debug/ +build-release/ +build-static/ +build-no-accel/ +build-sanitize-addr/ +build-sanitize-thread/ + +models/* + +/main +/quantize + +arm_neon.h +compile_commands.json +Dockerfile \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1a068ae75..94f199cb8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: make macOS-latest: - runs-on: macOS-latest + runs-on: macos-latest steps: - name: Clone diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 000000000..bc9aff7b7 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,61 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# GitHub recommends pinning actions to a commit SHA. +# To get a newer version, you will need to update the SHA. +# You can also reference a tag or branch, but the action may change without warning. + +name: Publish Docker image + +on: + pull_request: + push: + branches: + - master + +jobs: + push_to_registry: + name: Push Docker image to Docker Hub + runs-on: ubuntu-latest + env: + COMMIT_SHA: ${{ github.sha }} + strategy: + matrix: + config: + - { tag: "light", dockerfile: ".devops/main.Dockerfile" } + - { tag: "full", dockerfile: ".devops/full.Dockerfile" } + steps: + - name: Check out the repo + uses: actions/checkout@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Log in to Docker Hub + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push Docker image (versioned) + if: github.event_name == 'push' + uses: docker/build-push-action@v4 + with: + context: . + push: true + tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" + file: ${{ matrix.config.dockerfile }} + + - name: Build and push Docker image (tagged) + uses: docker/build-push-action@v4 + with: + context: . 
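+          # The previous step publishes an immutable "<tag>-<commit sha>" image on pushes to
+          # master; this step maintains the moving "light"/"full" tags. On pull requests the
+          # push expression below evaluates to false, so the image is built but not pushed.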
+ push: ${{ github.event_name == 'push' }} + tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}" + file: ${{ matrix.config.dockerfile }} \ No newline at end of file diff --git a/README.md b/README.md index 15e1b9a2d..8cf59f418 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ Supported platforms: - [X] Mac OS - [X] Linux - [X] Windows (via CMake) +- [X] Docker --- @@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 +### Docker + +#### Prerequisites +* Docker must be installed and running on your system. +* Create a folder to store big models & intermediate files (in ex. im using /llama/models) + +#### Images +We have two Docker images available for this project: + +1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. +2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. + +#### Usage + +The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image. + + ```bash +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B +``` + +On complete, you are ready to play! + +```bash +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +``` + +or with light image: + +```bash +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +``` ## Limitations diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py index 5c36e9c09..d0eb213c8 100644 --- a/convert-pth-to-ggml.py +++ b/convert-pth-to-ggml.py @@ -16,7 +16,7 @@ # At the start of the ggml file we write the model parameters # and vocabulary. 
# - +import os import sys import json import struct @@ -64,6 +64,10 @@ if len(sys.argv) > 2: sys.exit(1) fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" +if os.path.exists(fname_out): + print(f"Skip conversion, it already exists: {fname_out}") + sys.exit(0) + with open(fname_hparams, "r") as f: hparams = json.load(f) diff --git a/download-pth.py b/download-pth.py new file mode 100644 index 000000000..129532c0c --- /dev/null +++ b/download-pth.py @@ -0,0 +1,66 @@ +import os +import sys +from tqdm import tqdm +import requests + +if len(sys.argv) < 3: + print("Usage: download-pth.py dir-model model-type\n") + print(" model-type: Available models 7B, 13B, 30B or 65B") + sys.exit(1) + +modelsDir = sys.argv[1] +model = sys.argv[2] + +num = { + "7B": 1, + "13B": 2, + "30B": 4, + "65B": 8, +} + +if model not in num: + print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B") + sys.exit(1) + +print(f"Downloading model {model}") + +files = ["checklist.chk", "params.json"] + +for i in range(num[model]): + files.append(f"consolidated.0{i}.pth") + +resolved_path = os.path.abspath(os.path.join(modelsDir, model)) +os.makedirs(resolved_path, exist_ok=True) + +for file in files: + dest_path = os.path.join(resolved_path, file) + + if os.path.exists(dest_path): + print(f"Skip file download, it already exists: {file}") + continue + + url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}" + response = requests.get(url, stream=True) + with open(dest_path, 'wb') as f: + with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + t.update(len(chunk)) + +files2 = ["tokenizer_checklist.chk", "tokenizer.model"] +for file in files2: + dest_path = os.path.join(modelsDir, file) + + if os.path.exists(dest_path): + print(f"Skip file download, it already exists: {file}") + continue + + url = f"https://agi.gpt4.org/llama/LLaMA/{file}" + response = requests.get(url, stream=True) + with open(dest_path, 'wb') as f: + with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + t.update(len(chunk)) \ No newline at end of file From 6b0df5ccf360fe5c015f6607f0375bfc6849005e Mon Sep 17 00:00:00 2001 From: mmyjona Date: Sat, 18 Mar 2023 00:38:24 +0800 Subject: [PATCH 2/8] add ptread link to fix cmake build under linux (#114) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add ptread link to fix cmake build under linux * add cmake to linux and macos platform * separate make and cmake workflow --------- Co-authored-by: Sebastián A --- .github/workflows/build.yml | 43 ++++++++++++++++++++++++++++++++++--- CMakeLists.txt | 3 +++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 94f199cb8..a94a38991 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,7 +2,7 @@ name: CI on: [push, pull_request] jobs: - ubuntu-latest: + ubuntu-latest-make: runs-on: ubuntu-latest steps: @@ -18,7 +18,26 @@ jobs: run: | make - macOS-latest: + ubuntu-latest-cmake: + runs-on: ubuntu-latest + + steps: + - name: Clone + uses: actions/checkout@v1 + + - name: Dependencies + run: | + sudo apt-get update + sudo apt-get install build-essential + + - name: Build + run: | + mkdir build + cd build + cmake .. + cmake --build . 
--config Release + + macOS-latest-make: runs-on: macos-latest steps: @@ -33,7 +52,25 @@ jobs: run: | make - windows-latest: + macOS-latest-cmake: + runs-on: macOS-latest + + steps: + - name: Clone + uses: actions/checkout@v1 + + - name: Dependencies + run: | + brew update + + - name: Build + run: | + mkdir build + cd build + cmake .. + cmake --build . --config Release + + windows-latest-cmake: runs-on: windows-latest steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index ca3be38a5..38e7266dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,8 @@ project("llama.cpp") set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) @@ -126,3 +128,4 @@ target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS}) target_include_directories(ggml PUBLIC .) target_link_libraries(quantize PRIVATE ggml) target_link_libraries(llama PRIVATE ggml) +target_link_libraries(ggml PRIVATE Threads::Threads) From 367946c668757532deed929e1d78673c6ac6bcb8 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Fri, 17 Mar 2023 17:47:35 +0000 Subject: [PATCH 3/8] Don't tell users to use a bad number of threads (#243) The readme tells people to use the command line option "-t 8", causing 8 threads to be started. On systems with fewer than 8 cores, this causes a significant slowdown. Remove the option from the example command lines and use /proc/cpuinfo on Linux to determine a sensible default. --- .devops/tools.sh | 2 +- README.md | 10 +++++----- ggml.c | 4 ---- utils.cpp | 12 ++++++++++++ utils.h | 2 +- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/.devops/tools.sh b/.devops/tools.sh index b5711c94e..352e04942 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -34,7 +34,7 @@ else echo "Unknown command: $arg1" echo "Available commands: " echo " --run (-r): Run a model previously converted into ggml" - echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" echo " --convert (-c): Convert a llama model into ggml" echo " ex: \"/models/7B/\" 1" echo " --quantize (-q): Optimize with quantization process ggml" diff --git a/README.md b/README.md index 8cf59f418..7338ea790 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Supported platforms: Here is a typical run using LLaMA-7B: ```java -make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 I llama.cpp build info: I UNAME_S: Darwin I UNAME_P: arm @@ -150,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1 ./quantize.sh 7B # run the inference -./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128 +./main -m ./models/7B/ggml-model-q4_0.bin -n 128 ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. 
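The hunk above drops the hard-coded `-t 8` from the quick-start commands; the matching `utils.cpp` change later in this patch derives the default thread count from `/proc/cpuinfo` on Linux. As a quick sanity check, here is a small illustrative sketch (not part of the patch itself): the first command inspects the value the autodetection is based on, and `-t` remains available as an explicit override.

```bash
# Count the "processor" entries in /proc/cpuinfo (Linux) — the same signal utils.cpp now uses
grep -c '^processor' /proc/cpuinfo

# -t still overrides the autodetected default when you want to pin the thread count, e.g. to 4:
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -t 4
```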
@@ -164,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o Here is an example few-shot interaction, invoked with the command ``` -./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \ +./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \ -p \ "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. @@ -218,13 +218,13 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-on On complete, you are ready to play! ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with light image: ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` ## Limitations diff --git a/ggml.c b/ggml.c index c4f838917..4fb83adbd 100644 --- a/ggml.c +++ b/ggml.c @@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - if (cgraph->n_threads <= 0) { - cgraph->n_threads = 8; - } - const int n_threads = cgraph->n_threads; struct ggml_compute_state_shared state_shared = { diff --git a/utils.cpp b/utils.cpp index 26e313d5f..9e50487ef 100644 --- a/utils.cpp +++ b/utils.cpp @@ -16,6 +16,18 @@ #endif bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + // determine sensible default number of threads. + // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. 
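+    // On Linux, count the "processor" entries in /proc/cpuinfo instead: std::count over the
+    // file's whitespace-delimited tokens yields one match per logical CPU.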
+#ifdef __linux__ + std::ifstream cpuinfo("/proc/cpuinfo"); + params.n_threads = std::count(std::istream_iterator(cpuinfo), + std::istream_iterator(), + std::string("processor")); +#endif + if (params.n_threads == 0) { + params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); + } + for (int i = 1; i < argc; i++) { std::string arg = argv[i]; diff --git a/utils.h b/utils.h index 021120b05..5e5b40ffa 100644 --- a/utils.h +++ b/utils.h @@ -14,7 +14,7 @@ struct gpt_params { int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t n_threads; int32_t n_predict = 128; // new tokens to predict int32_t repeat_last_n = 64; // last n tokens to penalize int32_t n_ctx = 512; //context size From e81b9c81c101f64531ef0fa1ee6b77d562635652 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Mar 2023 20:30:04 +0200 Subject: [PATCH 4/8] Update Contributing section --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7338ea790..808d54e89 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,7 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch - Collaborators will be invited based on contributions - Any help with managing issues and PRs is very appreciated! +- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) ### Coding guidelines From 4f546091102a418ffdc6230f872ac56e5cedb835 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Mar 2023 21:46:46 +0200 Subject: [PATCH 5/8] Default to 4 threads (#243) --- utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils.h b/utils.h index 5e5b40ffa..c1a8498a7 100644 --- a/utils.h +++ b/utils.h @@ -14,11 +14,11 @@ struct gpt_params { int32_t seed = -1; // RNG seed - int32_t n_threads; + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_predict = 128; // new tokens to predict int32_t repeat_last_n = 64; // last n tokens to penalize int32_t n_ctx = 512; //context size - + // sampling parameters int32_t top_k = 40; float top_p = 0.95f; From c9f670a17755311aa28c411f5c7f3c8c05434770 Mon Sep 17 00:00:00 2001 From: thement <40525767+thement@users.noreply.github.com> Date: Fri, 17 Mar 2023 21:05:58 +0100 Subject: [PATCH 6/8] Implement non-greedy tokenizer that tries to maximize token lengths (#242) * Implement non-greedy tokenizer that tries to maximize token lengths * Insert single space in front of the prompt - this is to match original llama tokenizer behavior --------- Co-authored-by: Jakub Horak --- main.cpp | 2 ++ utils.cpp | 70 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/main.cpp b/main.cpp index ca0fca8b3..39c5d7b76 100644 --- a/main.cpp +++ b/main.cpp @@ -845,6 +845,8 @@ int main(int argc, char ** argv) { std::vector logits; + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); // tokenize the prompt std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); diff --git a/utils.cpp b/utils.cpp index 9e50487ef..22ef59377 100644 --- a/utils.cpp +++ b/utils.cpp @@ -287,40 +287,56 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri return tokens; } +// TODO: Calculate this constant from the vocabulary +#define 
MAX_TOKEN_LEN 18 +// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { - //auto res = gpt_tokenize(vocab, text); - - //if (bos) { - // res.insert(res.begin(), 1); // TODO: replace with vocab.bos - //} - std::vector res; + std::vector score; + std::vector prev; + int len = text.length(); + + score.resize(len + 1); + prev.resize(len + 1); + + // Forward pass + for (int i = 0; i < len; i++) { + int max_len = std::min(len - i, MAX_TOKEN_LEN); + for (int sub_len = 1; sub_len <= len - i; sub_len++) { + auto sub = text.substr(i, sub_len); + auto token = vocab.token_to_id.find(sub); + if (token != vocab.token_to_id.end()) { + int token_score = sub.length() * sub.length(); + int local_score = score[i] + token_score; + int next = i + sub_len; + if (score[next] < local_score) { + score[next] = local_score; + prev[next] = (*token).second; + } + } + } + } + + // Backward pass + int i = len; + while (i > 0) { + gpt_vocab::id token_id = prev[i]; + if (token_id == 0) { + // TODO: Return error or something more meaningful + printf("failed to tokenize string!\n"); + break; + } + res.push_back(token_id); + auto token = (*vocab.id_to_token.find(token_id)).second; + i -= token.length(); + } if (bos) { res.push_back(1); // TODO: replace with vocab.bos } - //find the longest token that matches the text - int pos = 0; - while (true) { - int l = 0; - int t = 0; - for (const auto & kv : vocab.id_to_token) { - if (kv.second.size() < l) continue; - if (kv.second.size() > text.size() - pos) continue; - if (text.substr(pos, kv.second.size()) == kv.second) { - l = kv.second.size(); - t = kv.first; - } - } - - if (l == 0) { - break; - } - - res.push_back(t); - pos += l; - } + // Pieces are in reverse order so correct that + std::reverse(res.begin(), res.end()); return res; } From a29274789309029fd88a9465e6d0832d4632272b Mon Sep 17 00:00:00 2001 From: Niklas Korz Date: Fri, 17 Mar 2023 23:03:48 +0100 Subject: [PATCH 7/8] Nix flake (#40) * Nix flake * Nix: only add Accelerate framework on macOS * Nix: development shel, direnv and compatibility * Nix: use python packages supplied by withPackages * Nix: remove channel compatibility * Nix: fix ARM neon dotproduct on macOS --------- Co-authored-by: Pavol Rusnak --- .gitignore | 4 ++++ flake.lock | 43 +++++++++++++++++++++++++++++++++++++++++++ flake.nix | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/.gitignore b/.gitignore index 5eb1ff1b8..3087b0ea5 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,10 @@ models/* /main /quantize +/result arm_neon.h compile_commands.json + +.envrc +.direnv/ diff --git a/flake.lock b/flake.lock new file mode 100644 index 000000000..343996da1 --- /dev/null +++ b/flake.lock @@ -0,0 +1,43 @@ +{ + "nodes": { + "flake-utils": { + "locked": { + "lastModified": 1676283394, + "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1678470307, + "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f", + "type": "github" + }, + "original": { + 
"owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 000000000..dae4ff60f --- /dev/null +++ b/flake.nix @@ -0,0 +1,48 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { + inherit system; + }; + llama-python = pkgs.python310.withPackages (ps: with ps; [ + torch + numpy + sentencepiece + ]); + in + { + packages.default = pkgs.stdenv.mkDerivation { + name = "llama.cpp"; + src = ./.; + nativeBuildInputs = with pkgs; [ cmake ]; + buildInputs = with pkgs; lib.optionals stdenv.isDarwin [ + darwin.apple_sdk.frameworks.Accelerate + ]; + cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [ + "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" + ]; + installPhase = '' + mkdir -p $out/bin + mv llama $out/bin/llama + mv quantize $out/bin/quantize + echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml + cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml + chmod +x $out/bin/convert-pth-to-ggml + ''; + }; + devShells.default = pkgs.mkShell { + packages = with pkgs; [ + cmake + llama-python + ] ++ lib.optionals stdenv.isDarwin [ + darwin.apple_sdk.frameworks.Accelerate + ]; + }; + } + ); +} From b2de7f18dfbb93463eeb5b4392117bbe82d5bd1b Mon Sep 17 00:00:00 2001 From: anzz1 Date: Sat, 18 Mar 2023 09:27:12 +0200 Subject: [PATCH 8/8] CI Improvements (#230) * CI Improvements Manual build feature, autoreleases for Windows * better CI naming convention use branch name in releases and tags --- .github/workflows/build.yml | 57 ++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a94a38991..9c1de5823 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,5 +1,20 @@ name: CI -on: [push, pull_request] + +on: + workflow_dispatch: # allows manual triggering + inputs: + create_release: + description: 'Create new release' + required: true + type: boolean + push: + paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp'] + pull_request: + types: [opened, synchronize, edited, reopened, review_requested, ready_for_review] + paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp'] + +env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} jobs: ubuntu-latest-make: @@ -7,14 +22,17 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Dependencies + id: depends run: | sudo apt-get update sudo apt-get install build-essential - name: Build + id: make_build run: | make @@ -42,13 +60,16 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Dependencies + id: depends run: | brew update - name: Build + id: make_build run: | make @@ -75,15 +96,49 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Build + id: cmake_build run: | mkdir build cd build cmake .. cmake --build . 
--config Release + - name: Get commit hash + id: commit + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: pr-mpt/actions-commit-hash@v2 + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + 7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\* + + - name: Create release + id: create_release + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: zendesk/action-create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }} + + - name: Upload release + id: upload_release + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip + asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip + asset_content_type: application/octet-stream + # ubuntu-latest-gcc: # runs-on: ubuntu-latest #
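With the `workflow_dispatch` trigger added above, a Windows build (and optional release) can also be started by hand. A minimal sketch, assuming the GitHub CLI (`gh`) is installed and authenticated against the repository; the workflow file name and the `create_release` input come from the patch above:

```bash
# Trigger the CI workflow manually and ask it to publish a release
gh workflow run build.yml -f create_release=true

# Follow the run, then list the releases (tagged "<branch>-<short sha>") once it completes
gh run list --workflow build.yml
gh release list
```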