diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
new file mode 100644
index 000000000..618cdddc4
--- /dev/null
+++ b/.devops/full.Dockerfile
@@ -0,0 +1,17 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install torch torchvision torchaudio sentencepiece numpy
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
\ No newline at end of file
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
new file mode 100644
index 000000000..cd575efa0
--- /dev/null
+++ b/.devops/main.Dockerfile
@@ -0,0 +1,18 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/main /main
+
+ENTRYPOINT [ "/main" ]
\ No newline at end of file
diff --git a/.devops/tools.sh b/.devops/tools.sh
new file mode 100755
index 000000000..352e04942
--- /dev/null
+++ b/.devops/tools.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -e
+
+# Read the first argument into a variable
+arg1="$1"
+
+# Shift the arguments to remove the first one
+shift
+
+# Join the remaining arguments into a single string
+arg2="$@"
+
+if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
+    python3 ./convert-pth-to-ggml.py $arg2
+elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
+    ./quantize $arg2
+elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
+    ./main $arg2
+elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
+    python3 ./download-pth.py $arg2
+elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
+    echo "Downloading model..."
+    python3 ./download-pth.py "$1" "$2"
+    echo "Converting PTH to GGML..."
+    for i in `ls $1/$2/ggml-model-f16.bin*`; do
+        if [ -f "${i/f16/q4_0}" ]; then
+            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
+        else
+            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
+            ./quantize "$i" "${i/f16/q4_0}" 2
+        fi
+    done
+else
+    echo "Unknown command: $arg1"
+    echo "Available commands: "
+    echo "  --run (-r): Run a model previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --convert (-c): Convert a llama model into ggml"
+    echo "                  ex: \"/models/7B/\" 1"
+    echo "  --quantize (-q): Optimize with quantization process ggml"
+    echo "                   ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
+    echo "                   ex: \"/models/\" 7B"
+    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
+    echo "                     ex: \"/models/\" 7B"
+fi
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 000000000..952990f26
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,24 @@
+*.o
+*.a
+.cache/
+.vs/
+.vscode/
+.DS_Store
+
+build/
+build-em/
+build-debug/
+build-release/
+build-static/
+build-no-accel/
+build-sanitize-addr/
+build-sanitize-thread/
+
+models/*
+
+/main
+/quantize
+
+arm_neon.h
+compile_commands.json
+Dockerfile
\ No newline at end of file
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1a068ae75..9c1de5823 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,8 +1,42 @@
 name: CI
-on: [push, pull_request]
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
+  push:
+    paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+  pull_request:
+    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
+    paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
 jobs:
-  ubuntu-latest:
+  ubuntu-latest-make:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: make_build
+        run: |
+          make
+
+  ubuntu-latest-cmake:
     runs-on: ubuntu-latest
 
     steps:
@@ -15,10 +49,31 @@ jobs:
         sudo apt-get install build-essential
 
       - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+
+  macOS-latest-make:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          brew update
+
+      - name: Build
+        id: make_build
         run: |
           make
 
-  macOS-latest:
+  macOS-latest-cmake:
     runs-on: macOS-latest
 
     steps:
@@ -31,22 +86,59 @@
 
       - name: Build
         run: |
-          make
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
 
-  windows-latest:
+  windows-latest-cmake:
    runs-on: windows-latest
 
    steps:
      - name: Clone
+        id: checkout
        uses: actions/checkout@v1
 
      - name: Build
+        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake ..
          cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\*
+
+      - name: Create release
+        id: create_release
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: zendesk/action-create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+
+      - name: Upload release
+        id: upload_release
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_content_type: application/octet-stream
+
 # ubuntu-latest-gcc:
 #   runs-on: ubuntu-latest
 #
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
new file mode 100644
index 000000000..bc9aff7b7
--- /dev/null
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,61 @@
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+# GitHub recommends pinning actions to a commit SHA.
+# To get a newer version, you will need to update the SHA.
+# You can also reference a tag or branch, but the action may change without warning.
+
+name: Publish Docker image
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    runs-on: ubuntu-latest
+    env:
+      COMMIT_SHA: ${{ github.sha }}
+    strategy:
+      matrix:
+        config:
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push Docker image (versioned)
+        if: github.event_name == 'push'
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: true
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          file: ${{ matrix.config.dockerfile }}
+
+      - name: Build and push Docker image (tagged)
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: ${{ github.event_name == 'push' }}
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
+          file: ${{ matrix.config.dockerfile }}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 5eb1ff1b8..3087b0ea5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,10 @@ models/*
 
 /main
 /quantize
+/result
 
 arm_neon.h
 compile_commands.json
+
+.envrc
+.direnv/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ca3be38a5..38e7266dc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,8 @@ project("llama.cpp")
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
@@ -126,3 +128,4 @@ target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
 target_include_directories(ggml PUBLIC .)
 target_link_libraries(quantize PRIVATE ggml)
 target_link_libraries(llama PRIVATE ggml)
+target_link_libraries(ggml PRIVATE Threads::Threads)
diff --git a/README.md b/README.md
index 15e1b9a2d..1fe5b5426 100644
--- a/README.md
+++ b/README.md
@@ -32,13 +32,14 @@ Supported platforms:
 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
+- [X] Docker
 
 ---
 
 Here is a typical run using LLaMA-7B:
 
 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 I llama.cpp build info:
 I UNAME_S:  Darwin
 I UNAME_P:  arm
@@ -149,12 +150,24 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./quantize.sh 7B
 
 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```
 
+Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
+
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
 
-TODO: add model disk/mem requirements
+### Memory/Disk Requirements
+
+As the models are currently fully loaded into memory, you will need adequate disk space to save them
+and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
+
+| model | original size | quantized size (4-bit) |
+|-------|---------------|------------------------|
+| 7B    | 13 GB         | 3.9 GB                 |
+| 13B   | 24 GB         | 7.8 GB                 |
+| 30B   | 60 GB         | 19.5 GB                |
+| 65B   | 120 GB        | 38.5 GB                |
 
 ### Interactive mode
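Note: the 4-bit column in the new table is a plausible consequence of the block format `./quantize` writes. Assuming the q4_0 layout (one 32-bit float scale plus QK/2 = 16 packed bytes per block of QK = 32 weights, i.e. 20 bytes per 32 weights; that layout itself is not shown in this patch), the effective storage cost is

$$ \frac{20 \times 8}{32} = 5\ \text{bits/weight} \qquad\Rightarrow\qquad 13\ \text{GB} \times \frac{5}{16} \approx 4.1\ \text{GB}, $$

which lands close to the 3.9 GB listed for 7B; the 13B, 30B, and 65B rows scale the same way from their 16-bit originals.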
@@ -163,7 +176,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o
 Here is an example few-shot interaction, invoked with the command
 
 ```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
 -p \
 "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
@@ -194,6 +207,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
 
 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
 
+### Docker
+
+#### Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store big models & intermediate files (e.g. `/llama/models`, used in the examples below)
+
+#### Images
+We have two Docker images available for this project:
+
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4 bits.
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
+
+#### Usage
+
+The easiest way to download the models, convert them to ggml and optimize them is with the `--all-in-one` command, which is included in the full Docker image.
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+```
+
+On completion, you are ready to play!
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+```
+
+or with the light image:
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+```
 
 ## Limitations
@@ -210,6 +254,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
 - Any help with managing issues and PRs is very appreciated!
+- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 
 ### Coding guidelines
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index 5c36e9c09..d0eb213c8 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -16,7 +16,7 @@
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
-
+import os
 import sys
 import json
 import struct
@@ -64,6 +64,10 @@ if len(sys.argv) > 2:
     sys.exit(1)
 
 fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
+
+if os.path.exists(fname_out):
+    print(f"Skip conversion, it already exists: {fname_out}")
+    sys.exit(0)
 
 with open(fname_hparams, "r") as f:
     hparams = json.load(f)
diff --git a/download-pth.py b/download-pth.py
new file mode 100644
index 000000000..129532c0c
--- /dev/null
+++ b/download-pth.py
@@ -0,0 +1,66 @@
+import os
+import sys
+from tqdm import tqdm
+import requests
+
+if len(sys.argv) < 3:
+    print("Usage: download-pth.py dir-model model-type\n")
+    print("  model-type: Available models 7B, 13B, 30B or 65B")
+    sys.exit(1)
+
+modelsDir = sys.argv[1]
+model = sys.argv[2]
+
+num = {
+    "7B": 1,
+    "13B": 2,
+    "30B": 4,
+    "65B": 8,
+}
+
+if model not in num:
+    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
+    sys.exit(1)
+
+print(f"Downloading model {model}")
+
+files = ["checklist.chk", "params.json"]
+
+for i in range(num[model]):
+    files.append(f"consolidated.0{i}.pth")
+
+resolved_path = os.path.abspath(os.path.join(modelsDir, model))
+os.makedirs(resolved_path, exist_ok=True)
+
+for file in files:
+    dest_path = os.path.join(resolved_path, file)
+
+    if os.path.exists(dest_path):
+        print(f"Skip file download, it already exists: {file}")
+        continue
+
+    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+                    t.update(len(chunk))
+
+files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
+for file in files2:
+    dest_path = os.path.join(modelsDir, file)
+
+    if os.path.exists(dest_path):
+        print(f"Skip file download, it already exists: {file}")
+        continue
+
+    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+                    t.update(len(chunk))
\ No newline at end of file
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 000000000..343996da1
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,43 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "locked": {
+        "lastModified": 1676283394,
+        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1678470307,
+        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 000000000..dae4ff60f
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,48 @@
+{
+  inputs = {
+    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
+    flake-utils.url = "github:numtide/flake-utils";
+  };
+  outputs = { self, nixpkgs, flake-utils }:
+    flake-utils.lib.eachDefaultSystem (system:
+      let
+        pkgs = import nixpkgs {
+          inherit system;
+        };
+        llama-python = pkgs.python310.withPackages (ps: with ps; [
+          torch
+          numpy
+          sentencepiece
+        ]);
+      in
+      {
+        packages.default = pkgs.stdenv.mkDerivation {
+          name = "llama.cpp";
+          src = ./.;
+          nativeBuildInputs = with pkgs; [ cmake ];
+          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
+            darwin.apple_sdk.frameworks.Accelerate
+          ];
+          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
+            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
+          ];
+          installPhase = ''
+            mkdir -p $out/bin
+            mv llama $out/bin/llama
+            mv quantize $out/bin/quantize
+            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
+            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
+            chmod +x $out/bin/convert-pth-to-ggml
+          '';
+        };
+        devShells.default = pkgs.mkShell {
+          packages = with pkgs; [
+            cmake
+            llama-python
+          ] ++ lib.optionals stdenv.isDarwin [
+            darwin.apple_sdk.frameworks.Accelerate
+          ];
+        };
+      }
+    );
+}
diff --git a/ggml.c b/ggml.c
index 535c7b7d2..4fb83adbd 100644
--- a/ggml.c
+++ b/ggml.c
@@ -607,10 +607,11 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
     assert(k % QK == 0);
 
     const int nb = k / QK;
+    const size_t bs = 2*sizeof(float) + QK/2;
 
-    float   * restrict pm = (float *) (y);
-    float   * restrict pd = (float *) (pm + nb);
-    uint8_t * restrict pb = (uint8_t *) (pd + nb);
+    uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
+    uint8_t * restrict pm = ((uint8_t *)y + 0*bs + sizeof(float));
+    uint8_t * restrict pb = ((uint8_t *)y + 0*bs + 2*sizeof(float));
 
     uint8_t pp[QK/2];
 
@@ -627,8 +628,10 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
         const float d = (max - min) / ((1 << 4) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        pm[i] = min;
-        pd[i] = d;
+        *(float *)pm = min;
+        *(float *)pd = d;
+        pm += bs;
+        pd += bs;
 
         for (int l = 0; l < QK; l += 2) {
             const float v0 = (x[i*QK + l + 0] - min)*id;
@@ -643,7 +646,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
             pp[l/2] = vi0 | (vi1 << 4);
         }
 
-        memcpy(pb + i*QK/2, pp, sizeof(pp));
+        memcpy(pb, pp, sizeof(pp));
+        pb += bs;
     }
 }
 
@@ -687,16 +691,17 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
     assert(k % QK == 0);
 
     const int nb = k / QK;
+    const size_t bs = 2*sizeof(float) + QK/2;
 
-    const float   * restrict pm = (const float *) (x);
-    const float   * restrict pd = (const float *) (pm + nb);
-    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
+    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
+    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
 
     for (int i = 0; i < nb; i++) {
-        const float m = pm[i];
-        const float d = pd[i];
+        const float d = *(const float *) (pd + i*bs);
+        const float m = *(const float *) (pm + i*bs);
 
-        const uint8_t * restrict pp = pb + i*QK/2;
+        const uint8_t * restrict pp = pb + i*bs;
 
         for (int l = 0; l < QK; l += 2) {
             const uint8_t vi = pp[l/2];
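Note: the hunks above (and the matching ones further down) switch q4_1 rows from a planar layout, where all mins, all deltas, and all packed nibbles were stored as three separate arrays, to an interleaved one in which each block carries its own two floats immediately followed by its QK/2 quantized bytes; that is what the new stride `bs = 2*sizeof(float) + QK/2` walks over. A minimal sketch of the resulting layout, assuming QK == 32 (`block_q4_1` and `dequant_one` are illustrative names, not part of this patch):

```cpp
#include <cstdint>

constexpr int QK = 32;

// One q4_1 block: the delta and min travel with their own quants,
// so a row of k weights is just (k/QK) consecutive 24-byte blocks.
struct block_q4_1 {
    float   d;          // delta (scale)
    float   m;          // min
    uint8_t qs[QK / 2]; // two 4-bit quants per byte
};

static_assert(sizeof(block_q4_1) == 2*sizeof(float) + QK/2, "must match bs");

// Recover weight l (0 <= l < QK) of block i, mirroring dequantize_row_q4_1:
inline float dequant_one(const block_q4_1 * row, int i, int l) {
    const uint8_t byte = row[i].qs[l / 2];
    const uint8_t vi   = (l % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
    return row[i].d * vi + row[i].m;
}
```

Keeping each block self-contained turns dequantization into one contiguous read instead of three scattered ones, which is also what the AVX2 kernel in the next hunk exploits.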
@@ -1584,28 +1589,109 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
 inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
     const int nb = n / QK;
 
-    const float * restrict pm0 = (const float *) x;
-    const float * restrict pm1 = (const float *) y;
+    const size_t bs = 2*sizeof(float) + QK/2;
 
-    const float * restrict pd0 = (const float *) (pm0 + nb);
-    const float * restrict pd1 = (const float *) (pm1 + nb);
+    const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs);
 
-    const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
-    const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
+    const uint8_t * restrict pm0 = ((const uint8_t *)x + 0*bs + sizeof(float));
+    const uint8_t * restrict pm1 = ((const uint8_t *)y + 0*bs + sizeof(float));
+
+    const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
+    const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + 2*sizeof(float));
 
     float sumf = 0.0;
 
-#if 1
+#if defined(__AVX2__)
+#if QK == 32
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    // Accumulator for constant offsets
+    float acc_offset = 0.0f;
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        const float * m0 = (const float *) (pm0 + i*bs);
+        const float * m1 = (const float *) (pm1 + i*bs);
+
+        const float * d0 = (const float *) (pd0 + i*bs);
+        const float * d1 = (const float *) (pd1 + i*bs);
+
+        const uint8_t * restrict p0 = pb0 + i*bs;
+        const uint8_t * restrict p1 = pb1 + i*bs;
+
+        const __m256 d0v = _mm256_broadcast_ss( d0 );
+        const __m256 d1v = _mm256_broadcast_ss( d1 );
+        const __m256 m0v = _mm256_broadcast_ss( m0 );
+        const __m256 m1v = _mm256_broadcast_ss( m1 );
+
+
+        // Compute combined scale for the block
+        const __m256 scale_01 = _mm256_mul_ps( d0v, d1v );
+
+        // Compute cross scales for the block
+        const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
+        const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
+        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        __m256i bx = bytesFromNibbles( p0 );
+        __m256i by = bytesFromNibbles( p1 );
+
+        // Now we have a vector with bytes in [ 0 .. 15 ] interval.
+
+        // Sign-extend first 16 signed bytes into int16_t
+        __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
+        __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
+        // Compute products of int16_t integers, add pairwise
+        __m256i i32 = _mm256_madd_epi16( x16, y16 );
+
+        // Sign-extend last 16 signed bytes into int16_t vectors
+        __m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
+        __m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
+        // Accumulate products of int16_t integers
+        i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) );
+
+        // compute sums of unsigned bytes in bx, by in blocks of 8.
+        // This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000,
+        // which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400.
+        // so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ]
+        __m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() );
+        __m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() );
+        __m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) );
+        __m256  sums  = _mm256_cvtepi32_ps( sumsi );
+
+        // Convert int32_t to float
+        __m256 p = _mm256_cvtepi32_ps( i32 );
+        // Apply the scale, and accumulate
+        // acc += d0*d1*x*y + d0*m1*x + d1*m0*y
+        acc = _mm256_fmadd_ps( scale_01, p, acc );
+        acc = _mm256_fmadd_ps( cross_scales, sums, acc );
+        // acc_offset += m0*m1 (for each entry in the block)
+        acc_offset += (*m0)*(*m1);
+    }
+
+    // Return horizontal sum of the acc vector
+    __m128 res = _mm256_extractf128_ps( acc, 1 );
+    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
+    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
+    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
+
+    sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
+#else
+#error "not implemented for QK"
+#endif
+#else
     // scalar
     for (int i = 0; i < nb; i++) {
-        const float m0 = pm0[i];
-        const float m1 = pm1[i];
+        const float m0 = *(const float *) (pm0 + i*bs);
+        const float m1 = *(const float *) (pm1 + i*bs);
 
-        const float d0 = pd0[i];
-        const float d1 = pd1[i];
+        const float d0 = *(const float *) (pd0 + i*bs);
+        const float d1 = *(const float *) (pd1 + i*bs);
 
-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
+        const uint8_t * restrict p0 = pb0 + i*bs;
+        const uint8_t * restrict p1 = pb1 + i*bs;
 
         for (int j = 0; j < QK/2; j++) {
             const uint8_t v0 = p0[j];
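Note: the AVX2 path above is easiest to read against the algebra it implements. With x[l] = d0*q0[l] + m0 and y[l] = d1*q1[l] + m1, one block's dot product expands into four terms, which is why the kernel keeps a combined scale (`scale_01`), two cross scales blended into `cross_scales`, the `_mm256_sad_epu8` byte sums, and a separate `acc_offset` multiplied by QK at the end. A scalar sketch of that expansion (a hypothetical reference helper operating on already-unpacked nibbles, not part of this patch):

```cpp
#include <cstdint>

// sum_l (d0*q0[l] + m0) * (d1*q1[l] + m1)
//   = d0*d1 * sum(q0*q1) + d0*m1 * sum(q0) + m0*d1 * sum(q1) + qk * m0*m1
float block_dot_q4_1_ref(float d0, float m0, const uint8_t * q0,  // q0[l] in [0, 15]
                         float d1, float m1, const uint8_t * q1,  // q1[l] in [0, 15]
                         int qk) {
    int sum01 = 0, sum0 = 0, sum1 = 0;
    for (int l = 0; l < qk; ++l) {
        sum01 += q0[l] * q1[l];
        sum0  += q0[l];
        sum1  += q1[l];
    }
    return d0*d1*sum01 + d0*m1*sum0 + m0*d1*sum1 + qk*m0*m1;
}
```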
@@ -1839,16 +1925,17 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res
     assert(n % QK == 0);
 
     const int nb = n / QK;
+    const size_t bs = 2*sizeof(float) + QK/2;
 
-    const float   * restrict pm = (const float *) (x);
-    const float   * restrict pd = (const float *) (pm + nb);
-    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
+    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
+    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
 
     for (int i = 0; i < nb; i++) {
-        const float m = pm[i];
-        const float d = pd[i];
+        const float d = *(const float *) (pd + i*bs);
+        const float m = *(const float *) (pm + i*bs);
 
-        const uint8_t * restrict pp = pb + i*QK/2;
+        const uint8_t * restrict pp = pb + i*bs;
 
         for (int l = 0; l < QK; l += 2) {
             const uint8_t vi = pp[l/2];
@@ -9231,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    if (cgraph->n_threads <= 0) {
-        cgraph->n_threads = 8;
-    }
-
     const int n_threads = cgraph->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
diff --git a/main.cpp b/main.cpp
index 4c2f85e23..a824d46c6 100644
--- a/main.cpp
+++ b/main.cpp
@@ -143,16 +143,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
     // load vocab
     {
-        const int32_t n_vocab = model.hparams.n_vocab;
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
         std::string word;
-        for (int i = 0; i < n_vocab; i++) {
+        for (int i = 0; i < model.hparams.n_vocab; i++) {
             uint32_t len;
             fin.read((char *) &len, sizeof(len));
 
@@ -825,6 +817,11 @@ int main(int argc, char ** argv) {
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
+
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified); "
+                "expect poor results\n", __func__, params.n_ctx);
+    }
 
     if (params.seed < 0) {
         params.seed = time(NULL);
@@ -870,6 +867,8 @@ int main(int argc, char ** argv) {
 
     std::vector<float> logits;
 
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
     std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
 
@@ -1048,7 +1047,7 @@ int main(int argc, char ** argv) {
             if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
             if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
                 // presumably empty line, consume the newline
-                scanf("%*c");
+                std::ignore = scanf("%*c");
                 n_read=0;
             }
             if(params.use_color) printf(ANSI_COLOR_RESET);
diff --git a/utils.cpp b/utils.cpp
index 20b6a86ce..966e9c91f 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -16,6 +16,18 @@
 #endif
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
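Note on the `/proc/cpuinfo` default above: `std::istream_iterator<std::string>` splits the stream on whitespace, so each logical CPU's `processor : N` stanza contributes exactly one token equal to `processor` to the count. A standalone sketch of the same idea (Linux-only, hypothetical demo program, not part of this patch):

```cpp
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>

int main() {
    std::ifstream cpuinfo("/proc/cpuinfo");
    // Each logical CPU has a "processor : N" line; counting the
    // whitespace-delimited "processor" tokens counts those lines.
    const auto n = std::count(std::istream_iterator<std::string>(cpuinfo),
                              std::istream_iterator<std::string>(),
                              std::string("processor"));
    std::cout << "detected " << n << " logical processors\n";
}
```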
@@ -277,40 +289,56 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
 
+// TODO: Calculate this constant from the vocabulary
+#define MAX_TOKEN_LEN 18
+// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
-
     std::vector<gpt_vocab::id> res;
+    std::vector<int> score;
+    std::vector<gpt_vocab::id> prev;
+    int len = text.length();
+
+    score.resize(len + 1);
+    prev.resize(len + 1);
+
+    // Forward pass
+    for (int i = 0; i < len; i++) {
+        int max_len = std::min(len - i, MAX_TOKEN_LEN);
+        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
+            auto sub = text.substr(i, sub_len);
+            auto token = vocab.token_to_id.find(sub);
+            if (token != vocab.token_to_id.end()) {
+                int token_score = sub.length() * sub.length();
+                int local_score = score[i] + token_score;
+                int next = i + sub_len;
+                if (score[next] < local_score) {
+                    score[next] = local_score;
+                    prev[next] = (*token).second;
+                }
+            }
+        }
+    }
+
+    // Backward pass
+    int i = len;
+    while (i > 0) {
+        gpt_vocab::id token_id = prev[i];
+        if (token_id == 0) {
+            // TODO: Return error or something more meaningful
+            printf("failed to tokenize string!\n");
+            break;
+        }
+        res.push_back(token_id);
+        auto token = (*vocab.id_to_token.find(token_id)).second;
+        i -= token.length();
+    }
 
     if (bos) {
         res.push_back(1); // TODO: replace with vocab.bos
     }
 
-    //find the longest token that matches the text
-    int pos = 0;
-    while (true) {
-        int l = 0;
-        int t = 0;
-        for (const auto & kv : vocab.id_to_token) {
-            if (kv.second.size() < l) continue;
-            if (kv.second.size() > text.size() - pos) continue;
-            if (text.substr(pos, kv.second.size()) == kv.second) {
-                l = kv.second.size();
-                t = kv.first;
-            }
-        }
-
-        if (l == 0) {
-            break;
-        }
-
-        res.push_back(t);
-        pos += l;
-    }
+    // Pieces are in reverse order so correct that
+    std::reverse(res.begin(), res.end());
 
     return res;
 }
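Note: the forward pass above prefers longer pieces because a token of length L scores L*L, and L^2 > a^2 + b^2 whenever a + b = L with a, b > 0, so a whole token always outscores any split of itself. A toy standalone version of the same dynamic programming over a hypothetical four-token vocabulary (not the real LLaMA vocab):

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    // Hypothetical vocabulary; the ids are arbitrary illustration values.
    const std::map<std::string, int> vocab = {
        {"h", 10}, {"he", 11}, {"llo", 12}, {"hello", 13},
    };
    const std::string text = "hello";
    const int len = (int) text.size();

    std::vector<int> score(len + 1, 0); // best score for a prefix of length i
    std::vector<int> prev (len + 1, 0); // id of the token ending that prefix

    for (int i = 0; i < len; i++) {
        for (int sub_len = 1; i + sub_len <= len; sub_len++) {
            const auto it = vocab.find(text.substr(i, sub_len));
            if (it == vocab.end()) continue;
            const int local_score = score[i] + sub_len*sub_len;
            if (local_score > score[i + sub_len]) {
                score[i + sub_len] = local_score;
                prev [i + sub_len] = it->second;
            }
        }
    }

    // "hello" as one piece scores 25, beating "he" + "llo" (4 + 9 = 13).
    printf("score = %d, last token id = %d\n", score[len], prev[len]);
    return 0;
}
```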
@@ -491,7 +519,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
 size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
     const int nb = k / qk;
 
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
+    const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
+    const size_t row_size = nb*bs;
 
     assert(k % qk == 0);
 
@@ -500,10 +529,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
 
     char * pdst = (char *) dst;
 
-    for (int j = 0; j < n; j += k) {
-        float   * pm = (float *)   (pdst + (j/k)*row_size);
-        float   * pd = (float *)   (pm + nb);
-        uint8_t * pb = (uint8_t *) (pd + nb);
+    for (int j = 0; j < n; j += k) {
+        uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
+        uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
+        uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
 
         //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
 
@@ -521,8 +550,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
             const float d = (max - min) / ((1 << 4) - 1);
             const float id = d ? 1.0f/d : 0.0f;
 
-            pm[i] = min;
-            pd[i] = d;
+            *(float *) pd = d;
+            *(float *) pm = min;
+            pd += bs;
+            pm += bs;
 
             for (int l = 0; l < qk; l += 2) {
                 const float v0 = (src[j + i*qk + l + 0] - min)*id;
@@ -540,7 +571,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
                 pp[l/2] = vi0 | (vi1 << 4);
             }
 
-            memcpy(pb + i*qk/2, pp, pp_size);
+            memcpy(pb, pp, pp_size);
+            pb += bs;
         }
     }
 }
diff --git a/utils.h b/utils.h
index dca497d06..ae263d452 100644
--- a/utils.h
+++ b/utils.h
@@ -18,7 +18,7 @@ struct gpt_params {
     int32_t n_predict = 128; // new tokens to predict
     int32_t repeat_last_n = 64;  // last n tokens to penalize
     int32_t n_ctx = 512; //context size
-
+
     // sampling parameters
     int32_t top_k = 40;
     float   top_p = 0.95f;