Merge branch 'master' into optimize-convert
This commit is contained in: commit a44ccef6ac
17 changed files with 522 additions and 42 deletions
.devops/full.Dockerfile (new file, 17 lines)
@@ -0,0 +1,17 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

RUN pip install --upgrade pip setuptools wheel \
    && pip install torch torchvision torchaudio sentencepiece numpy

WORKDIR /app

COPY . .

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]
.devops/main.Dockerfile (new file, 18 lines)
@@ -0,0 +1,18 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential

WORKDIR /app

COPY . .

RUN make

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/main /main

ENTRYPOINT [ "/main" ]
.devops/tools.sh (new executable file, 46 lines)
@@ -0,0 +1,46 @@
#!/bin/bash
set -e

# Read the first argument into a variable
arg1="$1"

# Shift the arguments to remove the first one
shift

# Join the remaining arguments into a single string
arg2="$@"

if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
    python3 ./convert-pth-to-ggml.py $arg2
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
    ./quantize $arg2
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
    ./main $arg2
elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
    python3 ./download-pth.py $arg2
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
    echo "Downloading model..."
    python3 ./download-pth.py "$1" "$2"
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
            ./quantize "$i" "${i/f16/q4_0}" 2
        fi
    done
else
    echo "Unknown command: $arg1"
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "              ex: \"/models/7B/\" 1"
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
    echo "              ex: \"/models/\" 7B"
    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
fi
.dockerignore (new file, 24 lines)
@@ -0,0 +1,24 @@
*.o
*.a
.cache/
.vs/
.vscode/
.DS_Store

build/
build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/

models/*

/main
/quantize

arm_neon.h
compile_commands.json
Dockerfile
.github/workflows/build.yml (vendored, 102 changes)
@@ -1,8 +1,42 @@
 name: CI
-on: [push, pull_request]
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
+  push:
+    paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+  pull_request:
+    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
+    paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

 jobs:
-  ubuntu-latest:
+  ubuntu-latest-make:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: make_build
+        run: |
+          make
+
+  ubuntu-latest-cmake:
     runs-on: ubuntu-latest

     steps:
@@ -15,10 +49,31 @@ jobs:
           sudo apt-get install build-essential

       - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+
+  macOS-latest-make:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          brew update
+
+      - name: Build
+        id: make_build
         run: |
           make

-  macOS-latest:
+  macOS-latest-cmake:
    runs-on: macOS-latest

    steps:
@@ -31,22 +86,59 @@ jobs:

       - name: Build
         run: |
-          make
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release

-  windows-latest:
+  windows-latest-cmake:
     runs-on: windows-latest

     steps:
       - name: Clone
+        id: checkout
         uses: actions/checkout@v1

       - name: Build
+        id: cmake_build
         run: |
           mkdir build
           cd build
           cmake ..
           cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\*
+
+      - name: Create release
+        id: create_release
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: zendesk/action-create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+
+      - name: Upload release
+        id: upload_release
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_content_type: application/octet-stream

 #  ubuntu-latest-gcc:
 #    runs-on: ubuntu-latest
 #
.github/workflows/docker.yml (vendored, new file, 61 lines)
@@ -0,0 +1,61 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# GitHub recommends pinning actions to a commit SHA.
# To get a newer version, you will need to update the SHA.
# You can also reference a tag or branch, but the action may change without warning.

name: Publish Docker image

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
      matrix:
        config:
          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
        uses: docker/build-push-action@v4
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
.gitignore (vendored, 4 changes)
@@ -18,6 +18,10 @@ models/*

 /main
 /quantize
+/result

 arm_neon.h
 compile_commands.json
+
+.envrc
+.direnv/
CMakeLists.txt
@@ -4,6 +4,8 @@ project("llama.cpp")
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)

 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
@@ -126,3 +128,4 @@ target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
 target_include_directories(ggml PUBLIC .)
 target_link_libraries(quantize PRIVATE ggml)
 target_link_libraries(llama PRIVATE ggml)
+target_link_libraries(ggml PRIVATE Threads::Threads)
README.md (39 changes)
@@ -32,13 +32,14 @@ Supported platforms:
 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
+- [X] Docker

 ---

 Here is a typical run using LLaMA-7B:

 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 I llama.cpp build info:
 I UNAME_S: Darwin
 I UNAME_P: arm
@@ -149,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./quantize.sh 7B

 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```

 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -163,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o

 Here is an example few-shot interaction, invoked with the command
 ```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
 -p \
 "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
@@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

+### Docker
+
+#### Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store the big models and intermediate files (e.g. /llama/models)
+
+#### Images
+We have two Docker images available for this project:
+
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable and the tools to convert LLaMA models into ggml and quantize them to 4-bit.
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable.
+
+#### Usage
+
+The easiest way to download the models, convert them to ggml, and optimize them is with the --all-in-one command, which is included in the full Docker image.
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+```
+
+On completion, you are ready to play!
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+```
+
+or with the light image:
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+```
+
 ## Limitations

@@ -210,6 +242,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
 - Any help with managing issues and PRs is very appreciated!
+- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)

 ### Coding guidelines
convert-pth-to-ggml.py
@@ -16,7 +16,7 @@
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
+import os
 import sys
 import json
 import struct
@@ -59,7 +59,6 @@ def get_n_parts(dim):
     print("Invalid dim: " + str(dim))
     sys.exit(1)

-
 def main():
     args = parse_args()
     dir_model = args.dir_model
download-pth.py (new file, 66 lines)
@@ -0,0 +1,66 @@
import os
import sys
from tqdm import tqdm
import requests

if len(sys.argv) < 3:
    print("Usage: download-pth.py dir-model model-type\n")
    print("  model-type: Available models 7B, 13B, 30B or 65B")
    sys.exit(1)

modelsDir = sys.argv[1]
model = sys.argv[2]

num = {
    "7B": 1,
    "13B": 2,
    "30B": 4,
    "65B": 8,
}

if model not in num:
    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
    sys.exit(1)

print(f"Downloading model {model}")

files = ["checklist.chk", "params.json"]

for i in range(num[model]):
    files.append(f"consolidated.0{i}.pth")

resolved_path = os.path.abspath(os.path.join(modelsDir, model))
os.makedirs(resolved_path, exist_ok=True)

for file in files:
    dest_path = os.path.join(resolved_path, file)

    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))

files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
for file in files2:
    dest_path = os.path.join(modelsDir, file)

    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))
flake.lock (generated, new file, 43 lines)
@@ -0,0 +1,43 @@
{
  "nodes": {
    "flake-utils": {
      "locked": {
        "lastModified": 1676283394,
        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1678470307,
        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}
flake.nix (new file, 48 lines)
@@ -0,0 +1,48 @@
{
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
    flake-utils.url = "github:numtide/flake-utils";
  };
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs {
          inherit system;
        };
        llama-python = pkgs.python310.withPackages (ps: with ps; [
          torch
          numpy
          sentencepiece
        ]);
      in
      {
        packages.default = pkgs.stdenv.mkDerivation {
          name = "llama.cpp";
          src = ./.;
          nativeBuildInputs = with pkgs; [ cmake ];
          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
            darwin.apple_sdk.frameworks.Accelerate
          ];
          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
          ];
          installPhase = ''
            mkdir -p $out/bin
            mv llama $out/bin/llama
            mv quantize $out/bin/quantize
            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
            chmod +x $out/bin/convert-pth-to-ggml
          '';
        };
        devShells.default = pkgs.mkShell {
          packages = with pkgs; [
            cmake
            llama-python
          ] ++ lib.optionals stdenv.isDarwin [
            darwin.apple_sdk.frameworks.Accelerate
          ];
        };
      }
    );
}
ggml.c (4 changes)
@@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }

 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    if (cgraph->n_threads <= 0) {
-        cgraph->n_threads = 8;
-    }
-
     const int n_threads = cgraph->n_threads;

     struct ggml_compute_state_shared state_shared = {
main.cpp (2 changes)
@@ -845,6 +845,8 @@ int main(int argc, char ** argv) {

     std::vector<float> logits;

+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
     std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
utils.cpp (82 changes)
@@ -16,6 +16,18 @@
 #endif

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
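For reference, the hunk above picks a default thread count by counting "processor" entries in /proc/cpuinfo, and only falls back to std::thread::hardware_concurrency() when that yields nothing. A minimal Python sketch of the same idea, for illustration only (the function name is hypothetical and not part of this commit):

```python
# Illustrative sketch (not part of this commit): mirror the default-thread
# heuristic from gpt_params_parse. Count "processor" tokens in /proc/cpuinfo,
# falling back to the generic CPU count when the file is unavailable or empty.
import os

def default_n_threads() -> int:
    count = 0
    try:
        with open("/proc/cpuinfo") as cpuinfo:
            count = sum(tok == "processor" for tok in cpuinfo.read().split())
    except OSError:
        pass
    return count if count > 0 else max(1, os.cpu_count() or 1)
```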
@@ -275,40 +287,56 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }

+// TODO: Calculate this constant from the vocabulary
+#define MAX_TOKEN_LEN 18
+// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
-
     std::vector<gpt_vocab::id> res;
+    std::vector<int> score;
+    std::vector<gpt_vocab::id> prev;
+    int len = text.length();
+
+    score.resize(len + 1);
+    prev.resize(len + 1);
+
+    // Forward pass
+    for (int i = 0; i < len; i++) {
+        int max_len = std::min(len - i, MAX_TOKEN_LEN);
+        for (int sub_len = 1; sub_len <= len - i; sub_len++) {
+            auto sub = text.substr(i, sub_len);
+            auto token = vocab.token_to_id.find(sub);
+            if (token != vocab.token_to_id.end()) {
+                int token_score = sub.length() * sub.length();
+                int local_score = score[i] + token_score;
+                int next = i + sub_len;
+                if (score[next] < local_score) {
+                    score[next] = local_score;
+                    prev[next] = (*token).second;
+                }
+            }
+        }
+    }
+
+    // Backward pass
+    int i = len;
+    while (i > 0) {
+        gpt_vocab::id token_id = prev[i];
+        if (token_id == 0) {
+            // TODO: Return error or something more meaningful
+            printf("failed to tokenize string!\n");
+            break;
+        }
+        res.push_back(token_id);
+        auto token = (*vocab.id_to_token.find(token_id)).second;
+        i -= token.length();
+    }

     if (bos) {
         res.push_back(1); // TODO: replace with vocab.bos
     }

-    //find the longest token that matches the text
-    int pos = 0;
-    while (true) {
-        int l = 0;
-        int t = 0;
-        for (const auto & kv : vocab.id_to_token) {
-            if (kv.second.size() < l) continue;
-            if (kv.second.size() > text.size() - pos) continue;
-            if (text.substr(pos, kv.second.size()) == kv.second) {
-                l = kv.second.size();
-                t = kv.first;
-            }
-        }
-
-        if (l == 0) {
-            break;
-        }
-
-        res.push_back(t);
-        pos += l;
-    }
+    // Pieces are in reverse order so correct that
+    std::reverse(res.begin(), res.end());

     return res;
 }
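The rewritten llama_tokenize above replaces the old greedy longest-match loop with a dynamic-programming segmentation: the forward pass records, for every position in the text, the best reachable score (sum of squared piece lengths) and the token that reaches it, and the backward pass walks prev[] from the end to recover the pieces, which come out in reverse order. A rough Python sketch of the same algorithm, for illustration only (it assumes vocab is a plain dict from token string to id, which is not the commit's actual gpt_vocab type):

```python
# Illustrative sketch (not part of this commit) of the forward/backward
# segmentation used in llama_tokenize: pick the split whose pieces maximize
# the sum of squared piece lengths, then backtrack to recover token ids.
MAX_TOKEN_LEN = 18  # same constant the C++ code hard-codes

def tokenize(vocab: dict[str, int], text: str) -> list[int]:
    n = len(text)
    score = [0] * (n + 1)
    prev = [None] * (n + 1)  # (token_id, piece_length) that best reaches each position

    # Forward pass: try every vocabulary piece starting at position i.
    for i in range(n):
        for sub_len in range(1, min(n - i, MAX_TOKEN_LEN) + 1):
            piece = text[i:i + sub_len]
            token_id = vocab.get(piece)
            if token_id is not None and score[i] + sub_len * sub_len > score[i + sub_len]:
                score[i + sub_len] = score[i] + sub_len * sub_len
                prev[i + sub_len] = (token_id, sub_len)

    # Backward pass: walk back from the end, collecting pieces in reverse order.
    res, i = [], n
    while i > 0:
        if prev[i] is None:
            raise ValueError("failed to tokenize string")
        token_id, sub_len = prev[i]
        res.append(token_id)
        i -= sub_len
    return list(reversed(res))
```

Scoring by squared piece length favors segmentations with fewer, longer tokens, whereas the removed greedy loop always committed to the locally longest match.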
utils.h (2 changes)
@@ -18,7 +18,7 @@ struct gpt_params {
     int32_t n_predict = 128; // new tokens to predict
     int32_t repeat_last_n = 64; // last n tokens to penalize
     int32_t n_ctx = 512; //context size

     // sampling parameters
     int32_t top_k = 40;
     float top_p = 0.95f;