From 2af23d30434a677c6416812eea52ccc0af65119c Mon Sep 17 00:00:00 2001 From: Bernat Vadell Date: Fri, 17 Mar 2023 10:47:06 +0100 Subject: [PATCH 1/8] =?UTF-8?q?=F0=9F=9A=80=20Dockerize=20llamacpp=20(#132?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: dockerize llamacpp * feat: split build & runtime stages * split dockerfile into main & tools * add quantize into tool docker image * Update .devops/tools.sh Co-authored-by: Georgi Gerganov * add docker action pipeline * change CI to publish at github docker registry * fix name runs-on macOS-latest is macos-latest (lowercase) * include docker versioned images * fix github action docker * fix docker.yml * feat: include all-in-one command tool & update readme.md --------- Co-authored-by: Georgi Gerganov --- .devops/full.Dockerfile | 17 ++++++++++ .devops/main.Dockerfile | 18 ++++++++++ .devops/tools.sh | 46 +++++++++++++++++++++++++ .dockerignore | 24 +++++++++++++ .github/workflows/build.yml | 2 +- .github/workflows/docker.yml | 61 +++++++++++++++++++++++++++++++++ README.md | 32 +++++++++++++++++ convert-pth-to-ggml.py | 6 +++- download-pth.py | 66 ++++++++++++++++++++++++++++++++++++ 9 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 .devops/full.Dockerfile create mode 100644 .devops/main.Dockerfile create mode 100755 .devops/tools.sh create mode 100644 .dockerignore create mode 100644 .github/workflows/docker.yml create mode 100644 download-pth.py diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile new file mode 100644 index 000000000..618cdddc4 --- /dev/null +++ b/.devops/full.Dockerfile @@ -0,0 +1,17 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION as build + +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip + +RUN pip install --upgrade pip setuptools wheel \ + && pip install torch torchvision torchaudio sentencepiece numpy + +WORKDIR /app + +COPY . . + +RUN make + +ENTRYPOINT ["/app/.devops/tools.sh"] \ No newline at end of file diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile new file mode 100644 index 000000000..cd575efa0 --- /dev/null +++ b/.devops/main.Dockerfile @@ -0,0 +1,18 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION as build + +RUN apt-get update && \ + apt-get install -y build-essential + +WORKDIR /app + +COPY . . + +RUN make + +FROM ubuntu:$UBUNTU_VERSION as runtime + +COPY --from=build /app/main /main + +ENTRYPOINT [ "/main" ] \ No newline at end of file diff --git a/.devops/tools.sh b/.devops/tools.sh new file mode 100755 index 000000000..b5711c94e --- /dev/null +++ b/.devops/tools.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -e + +# Read the first argument into a variable +arg1="$1" + +# Shift the arguments to remove the first one +shift + +# Join the remaining arguments into a single string +arg2="$@" + +if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then + python3 ./convert-pth-to-ggml.py $arg2 +elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then + ./quantize $arg2 +elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then + ./main $arg2 +elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then + python3 ./download-pth.py $arg2 +elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then + echo "Downloading model..." + python3 ./download-pth.py "$1" "$2" + echo "Converting PTH to GGML..." 
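+    # The loop below picks up every f16 ggml file produced for the chosen model size and
+    # quantizes it; "${i/f16/q4_0}" is bash parameter substitution that derives the q4_0
+    # output filename from the f16 input filename.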
+ for i in `ls $1/$2/ggml-model-f16.bin*`; do + if [ -f "${i/f16/q4_0}" ]; then + echo "Skip model quantization, it already exists: ${i/f16/q4_0}" + else + echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." + ./quantize "$i" "${i/f16/q4_0}" 2 + fi + done +else + echo "Unknown command: $arg1" + echo "Available commands: " + echo " --run (-r): Run a model previously converted into ggml" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512" + echo " --convert (-c): Convert a llama model into ggml" + echo " ex: \"/models/7B/\" 1" + echo " --quantize (-q): Optimize with quantization process ggml" + echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" + echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/" + echo " ex: \"/models/\" 7B" + echo " --all-in-one (-a): Execute --download, --convert & --quantize" + echo " ex: \"/models/\" 7B" +fi diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..952990f26 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,24 @@ +*.o +*.a +.cache/ +.vs/ +.vscode/ +.DS_Store + +build/ +build-em/ +build-debug/ +build-release/ +build-static/ +build-no-accel/ +build-sanitize-addr/ +build-sanitize-thread/ + +models/* + +/main +/quantize + +arm_neon.h +compile_commands.json +Dockerfile \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1a068ae75..94f199cb8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: make macOS-latest: - runs-on: macOS-latest + runs-on: macos-latest steps: - name: Clone diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 000000000..bc9aff7b7 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,61 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# GitHub recommends pinning actions to a commit SHA. +# To get a newer version, you will need to update the SHA. +# You can also reference a tag or branch, but the action may change without warning. + +name: Publish Docker image + +on: + pull_request: + push: + branches: + - master + +jobs: + push_to_registry: + name: Push Docker image to Docker Hub + runs-on: ubuntu-latest + env: + COMMIT_SHA: ${{ github.sha }} + strategy: + matrix: + config: + - { tag: "light", dockerfile: ".devops/main.Dockerfile" } + - { tag: "full", dockerfile: ".devops/full.Dockerfile" } + steps: + - name: Check out the repo + uses: actions/checkout@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Log in to Docker Hub + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push Docker image (versioned) + if: github.event_name == 'push' + uses: docker/build-push-action@v4 + with: + context: . + push: true + tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" + file: ${{ matrix.config.dockerfile }} + + - name: Build and push Docker image (tagged) + uses: docker/build-push-action@v4 + with: + context: . 
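+          # The previous step publishes an immutable "<tag>-<commit sha>" image on pushes to
+          # master; this step maintains the moving "light"/"full" tags. On pull requests the
+          # push expression below evaluates to false, so the image is built but not pushed.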
+ push: ${{ github.event_name == 'push' }} + tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}" + file: ${{ matrix.config.dockerfile }} \ No newline at end of file diff --git a/README.md b/README.md index 15e1b9a2d..8cf59f418 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ Supported platforms: - [X] Mac OS - [X] Linux - [X] Windows (via CMake) +- [X] Docker --- @@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 +### Docker + +#### Prerequisites +* Docker must be installed and running on your system. +* Create a folder to store big models & intermediate files (in ex. im using /llama/models) + +#### Images +We have two Docker images available for this project: + +1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. +2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. + +#### Usage + +The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image. + + ```bash +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B +``` + +On complete, you are ready to play! + +```bash +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +``` + +or with light image: + +```bash +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +``` ## Limitations diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py index 5c36e9c09..d0eb213c8 100644 --- a/convert-pth-to-ggml.py +++ b/convert-pth-to-ggml.py @@ -16,7 +16,7 @@ # At the start of the ggml file we write the model parameters # and vocabulary. 
# - +import os import sys import json import struct @@ -64,6 +64,10 @@ if len(sys.argv) > 2: sys.exit(1) fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" +if os.path.exists(fname_out): + print(f"Skip conversion, it already exists: {fname_out}") + sys.exit(0) + with open(fname_hparams, "r") as f: hparams = json.load(f) diff --git a/download-pth.py b/download-pth.py new file mode 100644 index 000000000..129532c0c --- /dev/null +++ b/download-pth.py @@ -0,0 +1,66 @@ +import os +import sys +from tqdm import tqdm +import requests + +if len(sys.argv) < 3: + print("Usage: download-pth.py dir-model model-type\n") + print(" model-type: Available models 7B, 13B, 30B or 65B") + sys.exit(1) + +modelsDir = sys.argv[1] +model = sys.argv[2] + +num = { + "7B": 1, + "13B": 2, + "30B": 4, + "65B": 8, +} + +if model not in num: + print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B") + sys.exit(1) + +print(f"Downloading model {model}") + +files = ["checklist.chk", "params.json"] + +for i in range(num[model]): + files.append(f"consolidated.0{i}.pth") + +resolved_path = os.path.abspath(os.path.join(modelsDir, model)) +os.makedirs(resolved_path, exist_ok=True) + +for file in files: + dest_path = os.path.join(resolved_path, file) + + if os.path.exists(dest_path): + print(f"Skip file download, it already exists: {file}") + continue + + url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}" + response = requests.get(url, stream=True) + with open(dest_path, 'wb') as f: + with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + t.update(len(chunk)) + +files2 = ["tokenizer_checklist.chk", "tokenizer.model"] +for file in files2: + dest_path = os.path.join(modelsDir, file) + + if os.path.exists(dest_path): + print(f"Skip file download, it already exists: {file}") + continue + + url = f"https://agi.gpt4.org/llama/LLaMA/{file}" + response = requests.get(url, stream=True) + with open(dest_path, 'wb') as f: + with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + t.update(len(chunk)) \ No newline at end of file From 6b0df5ccf360fe5c015f6607f0375bfc6849005e Mon Sep 17 00:00:00 2001 From: mmyjona Date: Sat, 18 Mar 2023 00:38:24 +0800 Subject: [PATCH 2/8] add ptread link to fix cmake build under linux (#114) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add ptread link to fix cmake build under linux * add cmake to linux and macos platform * separate make and cmake workflow --------- Co-authored-by: Sebastián A --- .github/workflows/build.yml | 43 ++++++++++++++++++++++++++++++++++--- CMakeLists.txt | 3 +++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 94f199cb8..a94a38991 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,7 +2,7 @@ name: CI on: [push, pull_request] jobs: - ubuntu-latest: + ubuntu-latest-make: runs-on: ubuntu-latest steps: @@ -18,7 +18,26 @@ jobs: run: | make - macOS-latest: + ubuntu-latest-cmake: + runs-on: ubuntu-latest + + steps: + - name: Clone + uses: actions/checkout@v1 + + - name: Dependencies + run: | + sudo apt-get update + sudo apt-get install build-essential + + - name: Build + run: | + mkdir build + cd build + cmake .. + cmake --build . 
--config Release + + macOS-latest-make: runs-on: macos-latest steps: @@ -33,7 +52,25 @@ jobs: run: | make - windows-latest: + macOS-latest-cmake: + runs-on: macOS-latest + + steps: + - name: Clone + uses: actions/checkout@v1 + + - name: Dependencies + run: | + brew update + + - name: Build + run: | + mkdir build + cd build + cmake .. + cmake --build . --config Release + + windows-latest-cmake: runs-on: windows-latest steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index ca3be38a5..38e7266dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,8 @@ project("llama.cpp") set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) @@ -126,3 +128,4 @@ target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS}) target_include_directories(ggml PUBLIC .) target_link_libraries(quantize PRIVATE ggml) target_link_libraries(llama PRIVATE ggml) +target_link_libraries(ggml PRIVATE Threads::Threads) From 367946c668757532deed929e1d78673c6ac6bcb8 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Fri, 17 Mar 2023 17:47:35 +0000 Subject: [PATCH 3/8] Don't tell users to use a bad number of threads (#243) The readme tells people to use the command line option "-t 8", causing 8 threads to be started. On systems with fewer than 8 cores, this causes a significant slowdown. Remove the option from the example command lines and use /proc/cpuinfo on Linux to determine a sensible default. --- .devops/tools.sh | 2 +- README.md | 10 +++++----- ggml.c | 4 ---- utils.cpp | 12 ++++++++++++ utils.h | 2 +- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/.devops/tools.sh b/.devops/tools.sh index b5711c94e..352e04942 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -34,7 +34,7 @@ else echo "Unknown command: $arg1" echo "Available commands: " echo " --run (-r): Run a model previously converted into ggml" - echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" echo " --convert (-c): Convert a llama model into ggml" echo " ex: \"/models/7B/\" 1" echo " --quantize (-q): Optimize with quantization process ggml" diff --git a/README.md b/README.md index 8cf59f418..7338ea790 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Supported platforms: Here is a typical run using LLaMA-7B: ```java -make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 I llama.cpp build info: I UNAME_S: Darwin I UNAME_P: arm @@ -150,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1 ./quantize.sh 7B # run the inference -./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128 +./main -m ./models/7B/ggml-model-q4_0.bin -n 128 ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. 
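The hunk above drops the hard-coded `-t 8` from the quick-start commands; the matching `utils.cpp` change later in this patch derives the default thread count from `/proc/cpuinfo` on Linux. As a quick sanity check, here is a small illustrative sketch (not part of the patch itself): the first command inspects the value the autodetection is based on, and `-t` remains available as an explicit override.

```bash
# Count the "processor" entries in /proc/cpuinfo (Linux) — the same signal utils.cpp now uses
grep -c '^processor' /proc/cpuinfo

# -t still overrides the autodetected default when you want to pin the thread count, e.g. to 4:
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -t 4
```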
@@ -164,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o Here is an example few-shot interaction, invoked with the command ``` -./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \ +./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \ -p \ "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. @@ -218,13 +218,13 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-on On complete, you are ready to play! ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with light image: ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` ## Limitations diff --git a/ggml.c b/ggml.c index c4f838917..4fb83adbd 100644 --- a/ggml.c +++ b/ggml.c @@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - if (cgraph->n_threads <= 0) { - cgraph->n_threads = 8; - } - const int n_threads = cgraph->n_threads; struct ggml_compute_state_shared state_shared = { diff --git a/utils.cpp b/utils.cpp index 26e313d5f..9e50487ef 100644 --- a/utils.cpp +++ b/utils.cpp @@ -16,6 +16,18 @@ #endif bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + // determine sensible default number of threads. + // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. 
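+    // On Linux, count the "processor" entries in /proc/cpuinfo instead: std::count over the
+    // file's whitespace-delimited tokens yields one match per logical CPU.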
+#ifdef __linux__ + std::ifstream cpuinfo("/proc/cpuinfo"); + params.n_threads = std::count(std::istream_iterator(cpuinfo), + std::istream_iterator(), + std::string("processor")); +#endif + if (params.n_threads == 0) { + params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); + } + for (int i = 1; i < argc; i++) { std::string arg = argv[i]; diff --git a/utils.h b/utils.h index 021120b05..5e5b40ffa 100644 --- a/utils.h +++ b/utils.h @@ -14,7 +14,7 @@ struct gpt_params { int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t n_threads; int32_t n_predict = 128; // new tokens to predict int32_t repeat_last_n = 64; // last n tokens to penalize int32_t n_ctx = 512; //context size From e81b9c81c101f64531ef0fa1ee6b77d562635652 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Mar 2023 20:30:04 +0200 Subject: [PATCH 4/8] Update Contributing section --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7338ea790..808d54e89 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,7 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch - Collaborators will be invited based on contributions - Any help with managing issues and PRs is very appreciated! +- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) ### Coding guidelines From 4f546091102a418ffdc6230f872ac56e5cedb835 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Mar 2023 21:46:46 +0200 Subject: [PATCH 5/8] Default to 4 threads (#243) --- utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils.h b/utils.h index 5e5b40ffa..c1a8498a7 100644 --- a/utils.h +++ b/utils.h @@ -14,11 +14,11 @@ struct gpt_params { int32_t seed = -1; // RNG seed - int32_t n_threads; + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_predict = 128; // new tokens to predict int32_t repeat_last_n = 64; // last n tokens to penalize int32_t n_ctx = 512; //context size - + // sampling parameters int32_t top_k = 40; float top_p = 0.95f; From c9f670a17755311aa28c411f5c7f3c8c05434770 Mon Sep 17 00:00:00 2001 From: thement <40525767+thement@users.noreply.github.com> Date: Fri, 17 Mar 2023 21:05:58 +0100 Subject: [PATCH 6/8] Implement non-greedy tokenizer that tries to maximize token lengths (#242) * Implement non-greedy tokenizer that tries to maximize token lengths * Insert single space in front of the prompt - this is to match original llama tokenizer behavior --------- Co-authored-by: Jakub Horak --- main.cpp | 2 ++ utils.cpp | 70 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/main.cpp b/main.cpp index ca0fca8b3..39c5d7b76 100644 --- a/main.cpp +++ b/main.cpp @@ -845,6 +845,8 @@ int main(int argc, char ** argv) { std::vector logits; + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); // tokenize the prompt std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); diff --git a/utils.cpp b/utils.cpp index 9e50487ef..22ef59377 100644 --- a/utils.cpp +++ b/utils.cpp @@ -287,40 +287,56 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri return tokens; } +// TODO: Calculate this constant from the vocabulary +#define 
MAX_TOKEN_LEN 18 +// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { - //auto res = gpt_tokenize(vocab, text); - - //if (bos) { - // res.insert(res.begin(), 1); // TODO: replace with vocab.bos - //} - std::vector res; + std::vector score; + std::vector prev; + int len = text.length(); + + score.resize(len + 1); + prev.resize(len + 1); + + // Forward pass + for (int i = 0; i < len; i++) { + int max_len = std::min(len - i, MAX_TOKEN_LEN); + for (int sub_len = 1; sub_len <= len - i; sub_len++) { + auto sub = text.substr(i, sub_len); + auto token = vocab.token_to_id.find(sub); + if (token != vocab.token_to_id.end()) { + int token_score = sub.length() * sub.length(); + int local_score = score[i] + token_score; + int next = i + sub_len; + if (score[next] < local_score) { + score[next] = local_score; + prev[next] = (*token).second; + } + } + } + } + + // Backward pass + int i = len; + while (i > 0) { + gpt_vocab::id token_id = prev[i]; + if (token_id == 0) { + // TODO: Return error or something more meaningful + printf("failed to tokenize string!\n"); + break; + } + res.push_back(token_id); + auto token = (*vocab.id_to_token.find(token_id)).second; + i -= token.length(); + } if (bos) { res.push_back(1); // TODO: replace with vocab.bos } - //find the longest token that matches the text - int pos = 0; - while (true) { - int l = 0; - int t = 0; - for (const auto & kv : vocab.id_to_token) { - if (kv.second.size() < l) continue; - if (kv.second.size() > text.size() - pos) continue; - if (text.substr(pos, kv.second.size()) == kv.second) { - l = kv.second.size(); - t = kv.first; - } - } - - if (l == 0) { - break; - } - - res.push_back(t); - pos += l; - } + // Pieces are in reverse order so correct that + std::reverse(res.begin(), res.end()); return res; } From a29274789309029fd88a9465e6d0832d4632272b Mon Sep 17 00:00:00 2001 From: Niklas Korz Date: Fri, 17 Mar 2023 23:03:48 +0100 Subject: [PATCH 7/8] Nix flake (#40) * Nix flake * Nix: only add Accelerate framework on macOS * Nix: development shel, direnv and compatibility * Nix: use python packages supplied by withPackages * Nix: remove channel compatibility * Nix: fix ARM neon dotproduct on macOS --------- Co-authored-by: Pavol Rusnak --- .gitignore | 4 ++++ flake.lock | 43 +++++++++++++++++++++++++++++++++++++++++++ flake.nix | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/.gitignore b/.gitignore index 5eb1ff1b8..3087b0ea5 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,10 @@ models/* /main /quantize +/result arm_neon.h compile_commands.json + +.envrc +.direnv/ diff --git a/flake.lock b/flake.lock new file mode 100644 index 000000000..343996da1 --- /dev/null +++ b/flake.lock @@ -0,0 +1,43 @@ +{ + "nodes": { + "flake-utils": { + "locked": { + "lastModified": 1676283394, + "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1678470307, + "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f", + "type": "github" + }, + "original": { + 
"owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 000000000..dae4ff60f --- /dev/null +++ b/flake.nix @@ -0,0 +1,48 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { + inherit system; + }; + llama-python = pkgs.python310.withPackages (ps: with ps; [ + torch + numpy + sentencepiece + ]); + in + { + packages.default = pkgs.stdenv.mkDerivation { + name = "llama.cpp"; + src = ./.; + nativeBuildInputs = with pkgs; [ cmake ]; + buildInputs = with pkgs; lib.optionals stdenv.isDarwin [ + darwin.apple_sdk.frameworks.Accelerate + ]; + cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [ + "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" + ]; + installPhase = '' + mkdir -p $out/bin + mv llama $out/bin/llama + mv quantize $out/bin/quantize + echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml + cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml + chmod +x $out/bin/convert-pth-to-ggml + ''; + }; + devShells.default = pkgs.mkShell { + packages = with pkgs; [ + cmake + llama-python + ] ++ lib.optionals stdenv.isDarwin [ + darwin.apple_sdk.frameworks.Accelerate + ]; + }; + } + ); +} From b2de7f18dfbb93463eeb5b4392117bbe82d5bd1b Mon Sep 17 00:00:00 2001 From: anzz1 Date: Sat, 18 Mar 2023 09:27:12 +0200 Subject: [PATCH 8/8] CI Improvements (#230) * CI Improvements Manual build feature, autoreleases for Windows * better CI naming convention use branch name in releases and tags --- .github/workflows/build.yml | 57 ++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a94a38991..9c1de5823 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,5 +1,20 @@ name: CI -on: [push, pull_request] + +on: + workflow_dispatch: # allows manual triggering + inputs: + create_release: + description: 'Create new release' + required: true + type: boolean + push: + paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp'] + pull_request: + types: [opened, synchronize, edited, reopened, review_requested, ready_for_review] + paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp'] + +env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} jobs: ubuntu-latest-make: @@ -7,14 +22,17 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Dependencies + id: depends run: | sudo apt-get update sudo apt-get install build-essential - name: Build + id: make_build run: | make @@ -42,13 +60,16 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Dependencies + id: depends run: | brew update - name: Build + id: make_build run: | make @@ -75,15 +96,49 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Build + id: cmake_build run: | mkdir build cd build cmake .. cmake --build . 
--config Release + - name: Get commit hash + id: commit + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: pr-mpt/actions-commit-hash@v2 + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + 7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\* + + - name: Create release + id: create_release + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: zendesk/action-create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }} + + - name: Upload release + id: upload_release + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip + asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip + asset_content_type: application/octet-stream + # ubuntu-latest-gcc: # runs-on: ubuntu-latest #
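With the `workflow_dispatch` trigger added above, a Windows build (and optional release) can also be started by hand. A minimal sketch, assuming the GitHub CLI (`gh`) is installed and authenticated against the repository; the workflow file name and the `create_release` input come from the patch above:

```bash
# Trigger the CI workflow manually and ask it to publish a release
gh workflow run build.yml -f create_release=true

# Follow the run, then list the releases (tagged "<branch>-<short sha>") once it completes
gh run list --workflow build.yml
gh release list
```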