From 904d2a8d6acd667c9633138d45a361d40fbf76d0 Mon Sep 17 00:00:00 2001 From: Matvey Soloviev Date: Fri, 17 Mar 2023 05:48:39 +0100 Subject: [PATCH 01/20] Q4_1 quantization (#193) * Add AVX2 version of ggml_vec_dot_q4_1 * Small optimisations to q4_1 dot product (@Const-me) * Rearrange Q4_1 quantization to work for multipart models. (Fix #152) * Fix ggml_vec_mad_q4_1 too * Fix non-vectorised q4_1 vec mul --- ggml.c | 149 ++++++++++++++++++++++++++++++++++++++++++------------ utils.cpp | 20 +++++--- 2 files changed, 130 insertions(+), 39 deletions(-) diff --git a/ggml.c b/ggml.c index 535c7b7d2..c4f838917 100644 --- a/ggml.c +++ b/ggml.c @@ -607,10 +607,11 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { assert(k % QK == 0); const int nb = k / QK; + const size_t bs = 2*sizeof(float) + QK/2; - float * restrict pm = (float *) (y); - float * restrict pd = (float *) (pm + nb); - uint8_t * restrict pb = (uint8_t *) (pd + nb); + uint8_t * restrict pd = ((uint8_t *)y + 0*bs); + uint8_t * restrict pm = ((uint8_t *)y + 0*bs + sizeof(float)); + uint8_t * restrict pb = ((uint8_t *)y + 0*bs + 2*sizeof(float)); uint8_t pp[QK/2]; @@ -627,8 +628,10 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - pm[i] = min; - pd[i] = d; + *(float *)pm = min; + *(float *)pd = d; + pm += bs; + pd += bs; for (int l = 0; l < QK; l += 2) { const float v0 = (x[i*QK + l + 0] - min)*id; @@ -643,7 +646,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { pp[l/2] = vi0 | (vi1 << 4); } - memcpy(pb + i*QK/2, pp, sizeof(pp)); + memcpy(pb, pp, sizeof(pp)); + pb += bs; } } @@ -687,16 +691,17 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) { assert(k % QK == 0); const int nb = k / QK; + const size_t bs = 2*sizeof(float) + QK/2; - const float * restrict pm = (const float *) (x); - const float * restrict pd = (const float *) (pm + nb); - const uint8_t * restrict pb = (const uint8_t *) (pd + nb); + const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { - const float m = pm[i]; - const float d = pd[i]; + const float d = *(const float *) (pd + i*bs); + const float m = *(const float *) (pm + i*bs); - const uint8_t * restrict pp = pb + i*QK/2; + const uint8_t * restrict pp = pb + i*bs; for (int l = 0; l < QK; l += 2) { const uint8_t vi = pp[l/2]; @@ -1584,28 +1589,109 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) { const int nb = n / QK; - const float * restrict pm0 = (const float *) x; - const float * restrict pm1 = (const float *) y; + const size_t bs = 2*sizeof(float) + QK/2; - const float * restrict pd0 = (const float *) (pm0 + nb); - const float * restrict pd1 = (const float *) (pm1 + nb); + const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs); - const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb); - const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb); + const uint8_t * restrict pm0 = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pm1 = ((const uint8_t *)y + 0*bs + sizeof(float)); + + 
const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); + const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + 2*sizeof(float)); float sumf = 0.0; -#if 1 +#if defined(__AVX2__) +#if QK == 32 + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + // Accumulator for constant offsets + float acc_offset = 0.0f; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float * m0 = (const float *) (pm0 + i*bs); + const float * m1 = (const float *) (pm1 + i*bs); + + const float * d0 = (const float *) (pd0 + i*bs); + const float * d1 = (const float *) (pd1 + i*bs); + + const uint8_t * restrict p0 = pb0 + i*bs; + const uint8_t * restrict p1 = pb1 + i*bs; + + const __m256 d0v = _mm256_broadcast_ss( d0 ); + const __m256 d1v = _mm256_broadcast_ss( d1 ); + const __m256 m0v = _mm256_broadcast_ss( m0 ); + const __m256 m1v = _mm256_broadcast_ss( m1 ); + + + // Compute combined scale for the block + const __m256 scale_01 = _mm256_mul_ps( d0v, d1v ); + + // Compute cross scales for the block + const __m256 scale_0 = _mm256_mul_ps( d0v, m1v ); + const __m256 scale_1 = _mm256_mul_ps( m0v, d1v ); + const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + __m256i bx = bytesFromNibbles( p0 ); + __m256i by = bytesFromNibbles( p1 ); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. + + // Sign-extend first 16 signed bytes into int16_t + __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) ); + __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); + // Compute products of int16_t integers, add pairwise + __m256i i32 = _mm256_madd_epi16( x16, y16 ); + + // Sign-extend last 16 signed bytes into int16_t vectors + __m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) ); + __m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); + // Accumulate products of int16_t integers + i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) ); + + // compute sums of unsigned bytes in bx, by in blocks of 8. + // This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000, + // which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400. 
+ // so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ] + __m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() ); + __m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() ); + __m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) ); + __m256 sums = _mm256_cvtepi32_ps( sumsi ); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps( i32 ); + // Apply the scale, and accumulate + // acc += d0*d1*x*y + d0*m1*x + d1*m0*y + acc = _mm256_fmadd_ps( scale_01, p, acc ); + acc = _mm256_fmadd_ps( cross_scales, sums, acc ); + // acc_offset += m0*m1 (for each entry in the block) + acc_offset += (*m0)*(*m1); + } + + // Return horizontal sum of the acc vector + __m128 res = _mm256_extractf128_ps( acc, 1 ); + res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); + res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); + res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); + + sumf = _mm_cvtss_f32( res ) + acc_offset * QK; +#else +#error "not implemented for QK" +#endif +#else // scalar for (int i = 0; i < nb; i++) { - const float m0 = pm0[i]; - const float m1 = pm1[i]; + const float m0 = *(const float *) (pm0 + i*bs); + const float m1 = *(const float *) (pm1 + i*bs); - const float d0 = pd0[i]; - const float d1 = pd1[i]; + const float d0 = *(const float *) (pd0 + i*bs); + const float d1 = *(const float *) (pd1 + i*bs); - const uint8_t * restrict p0 = pb0 + i*QK/2; - const uint8_t * restrict p1 = pb1 + i*QK/2; + const uint8_t * restrict p0 = pb0 + i*bs; + const uint8_t * restrict p1 = pb1 + i*bs; for (int j = 0; j < QK/2; j++) { const uint8_t v0 = p0[j]; @@ -1839,16 +1925,17 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res assert(n % QK == 0); const int nb = n / QK; + const size_t bs = 2*sizeof(float) + QK/2; - const float * restrict pm = (const float *) (x); - const float * restrict pd = (const float *) (pm + nb); - const uint8_t * restrict pb = (const uint8_t *) (pd + nb); + const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { - const float m = pm[i]; - const float d = pd[i]; + const float d = *(const float *) (pd + i*bs); + const float m = *(const float *) (pm + i*bs); - const uint8_t * restrict pp = pb + i*QK/2; + const uint8_t * restrict pp = pb + i*bs; for (int l = 0; l < QK; l += 2) { const uint8_t vi = pp[l/2]; diff --git a/utils.cpp b/utils.cpp index aa3ad1053..26e313d5f 100644 --- a/utils.cpp +++ b/utils.cpp @@ -489,7 +489,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { const int nb = k / qk; - const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2); + const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2); + const size_t row_size = nb*bs; assert(k % qk == 0); @@ -498,10 +499,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t char * pdst = (char *) dst; - for (int j = 0; j < n; j += k) { - float * pm = (float *) (pdst + (j/k)*row_size); - float * pd = (float *) (pm + nb); - uint8_t * pb = (uint8_t *) (pd + nb); + for (int j = 0; j < n; j += k) { + uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); + uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); + uint8_t * pb = 
(uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float)); //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); @@ -519,8 +520,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - pm[i] = min; - pd[i] = d; + *(float *) pd = d; + *(float *) pm = min; + pd += bs; + pm += bs; for (int l = 0; l < qk; l += 2) { const float v0 = (src[j + i*qk + l + 0] - min)*id; @@ -538,7 +541,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t pp[l/2] = vi0 | (vi1 << 4); } - memcpy(pb + i*qk/2, pp, pp_size); + memcpy(pb, pp, pp_size); + pb += bs; } } } From 2af23d30434a677c6416812eea52ccc0af65119c Mon Sep 17 00:00:00 2001 From: Bernat Vadell Date: Fri, 17 Mar 2023 10:47:06 +0100 Subject: [PATCH 02/20] =?UTF-8?q?=F0=9F=9A=80=20Dockerize=20llamacpp=20(#1?= =?UTF-8?q?32)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: dockerize llamacpp * feat: split build & runtime stages * split dockerfile into main & tools * add quantize into tool docker image * Update .devops/tools.sh Co-authored-by: Georgi Gerganov * add docker action pipeline * change CI to publish at github docker registry * fix name runs-on macOS-latest is macos-latest (lowercase) * include docker versioned images * fix github action docker * fix docker.yml * feat: include all-in-one command tool & update readme.md --------- Co-authored-by: Georgi Gerganov --- .devops/full.Dockerfile | 17 ++++++++++ .devops/main.Dockerfile | 18 ++++++++++ .devops/tools.sh | 46 +++++++++++++++++++++++++ .dockerignore | 24 +++++++++++++ .github/workflows/build.yml | 2 +- .github/workflows/docker.yml | 61 +++++++++++++++++++++++++++++++++ README.md | 32 +++++++++++++++++ convert-pth-to-ggml.py | 6 +++- download-pth.py | 66 ++++++++++++++++++++++++++++++++++++ 9 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 .devops/full.Dockerfile create mode 100644 .devops/main.Dockerfile create mode 100755 .devops/tools.sh create mode 100644 .dockerignore create mode 100644 .github/workflows/docker.yml create mode 100644 download-pth.py diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile new file mode 100644 index 000000000..618cdddc4 --- /dev/null +++ b/.devops/full.Dockerfile @@ -0,0 +1,17 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION as build + +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip + +RUN pip install --upgrade pip setuptools wheel \ + && pip install torch torchvision torchaudio sentencepiece numpy + +WORKDIR /app + +COPY . . + +RUN make + +ENTRYPOINT ["/app/.devops/tools.sh"] \ No newline at end of file diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile new file mode 100644 index 000000000..cd575efa0 --- /dev/null +++ b/.devops/main.Dockerfile @@ -0,0 +1,18 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION as build + +RUN apt-get update && \ + apt-get install -y build-essential + +WORKDIR /app + +COPY . . 
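+
+# Build everything with `make`; the runtime stage below copies out only the resulting `main` binary.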
+ +RUN make + +FROM ubuntu:$UBUNTU_VERSION as runtime + +COPY --from=build /app/main /main + +ENTRYPOINT [ "/main" ] \ No newline at end of file diff --git a/.devops/tools.sh b/.devops/tools.sh new file mode 100755 index 000000000..b5711c94e --- /dev/null +++ b/.devops/tools.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -e + +# Read the first argument into a variable +arg1="$1" + +# Shift the arguments to remove the first one +shift + +# Join the remaining arguments into a single string +arg2="$@" + +if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then + python3 ./convert-pth-to-ggml.py $arg2 +elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then + ./quantize $arg2 +elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then + ./main $arg2 +elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then + python3 ./download-pth.py $arg2 +elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then + echo "Downloading model..." + python3 ./download-pth.py "$1" "$2" + echo "Converting PTH to GGML..." + for i in `ls $1/$2/ggml-model-f16.bin*`; do + if [ -f "${i/f16/q4_0}" ]; then + echo "Skip model quantization, it already exists: ${i/f16/q4_0}" + else + echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." + ./quantize "$i" "${i/f16/q4_0}" 2 + fi + done +else + echo "Unknown command: $arg1" + echo "Available commands: " + echo " --run (-r): Run a model previously converted into ggml" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512" + echo " --convert (-c): Convert a llama model into ggml" + echo " ex: \"/models/7B/\" 1" + echo " --quantize (-q): Optimize with quantization process ggml" + echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" + echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/" + echo " ex: \"/models/\" 7B" + echo " --all-in-one (-a): Execute --download, --convert & --quantize" + echo " ex: \"/models/\" 7B" +fi diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..952990f26 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,24 @@ +*.o +*.a +.cache/ +.vs/ +.vscode/ +.DS_Store + +build/ +build-em/ +build-debug/ +build-release/ +build-static/ +build-no-accel/ +build-sanitize-addr/ +build-sanitize-thread/ + +models/* + +/main +/quantize + +arm_neon.h +compile_commands.json +Dockerfile \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1a068ae75..94f199cb8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: make macOS-latest: - runs-on: macOS-latest + runs-on: macos-latest steps: - name: Clone diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 000000000..bc9aff7b7 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,61 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# GitHub recommends pinning actions to a commit SHA. +# To get a newer version, you will need to update the SHA. +# You can also reference a tag or branch, but the action may change without warning. 
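#
# In short: this workflow builds the "light" and "full" images for every push and pull request,
# and, on pushes to master, publishes them to the GitHub Container Registry (ghcr.io).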
+
+name: Publish Docker image
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    runs-on: ubuntu-latest
+    env:
+      COMMIT_SHA: ${{ github.sha }}
+    strategy:
+      matrix:
+        config:
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push Docker image (versioned)
+        if: github.event_name == 'push'
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: true
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          file: ${{ matrix.config.dockerfile }}
+
+      - name: Build and push Docker image (tagged)
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: ${{ github.event_name == 'push' }}
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
+          file: ${{ matrix.config.dockerfile }}
\ No newline at end of file
diff --git a/README.md b/README.md
index 15e1b9a2d..8cf59f418 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ Supported platforms:
 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
+- [X] Docker
 
 ---
 
@@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
 
 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
 
+### Docker
+
+#### Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store the big models and intermediate files (e.g., I'm using /llama/models)
+
+#### Images
+We have two Docker images available for this project:
+
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
+
+#### Usage
+
+The easiest way to download the models, convert them to ggml, and optimize them is with the `--all-in-one` command, which is included in the full Docker image.
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+```
+
+Once complete, you are ready to play!
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+```
+
+or with the light image:
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+```
 
 ## Limitations
 
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index 5c36e9c09..d0eb213c8 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -16,7 +16,7 @@
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
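 # (Roughly: a magic number, then the hyperparameters from params.json, then the
 #  sentencepiece vocabulary as length-prefixed token strings.)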
# - +import os import sys import json import struct @@ -64,6 +64,10 @@ if len(sys.argv) > 2: sys.exit(1) fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" +if os.path.exists(fname_out): + print(f"Skip conversion, it already exists: {fname_out}") + sys.exit(0) + with open(fname_hparams, "r") as f: hparams = json.load(f) diff --git a/download-pth.py b/download-pth.py new file mode 100644 index 000000000..129532c0c --- /dev/null +++ b/download-pth.py @@ -0,0 +1,66 @@ +import os +import sys +from tqdm import tqdm +import requests + +if len(sys.argv) < 3: + print("Usage: download-pth.py dir-model model-type\n") + print(" model-type: Available models 7B, 13B, 30B or 65B") + sys.exit(1) + +modelsDir = sys.argv[1] +model = sys.argv[2] + +num = { + "7B": 1, + "13B": 2, + "30B": 4, + "65B": 8, +} + +if model not in num: + print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B") + sys.exit(1) + +print(f"Downloading model {model}") + +files = ["checklist.chk", "params.json"] + +for i in range(num[model]): + files.append(f"consolidated.0{i}.pth") + +resolved_path = os.path.abspath(os.path.join(modelsDir, model)) +os.makedirs(resolved_path, exist_ok=True) + +for file in files: + dest_path = os.path.join(resolved_path, file) + + if os.path.exists(dest_path): + print(f"Skip file download, it already exists: {file}") + continue + + url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}" + response = requests.get(url, stream=True) + with open(dest_path, 'wb') as f: + with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + t.update(len(chunk)) + +files2 = ["tokenizer_checklist.chk", "tokenizer.model"] +for file in files2: + dest_path = os.path.join(modelsDir, file) + + if os.path.exists(dest_path): + print(f"Skip file download, it already exists: {file}") + continue + + url = f"https://agi.gpt4.org/llama/LLaMA/{file}" + response = requests.get(url, stream=True) + with open(dest_path, 'wb') as f: + with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + t.update(len(chunk)) \ No newline at end of file From 6b0df5ccf360fe5c015f6607f0375bfc6849005e Mon Sep 17 00:00:00 2001 From: mmyjona Date: Sat, 18 Mar 2023 00:38:24 +0800 Subject: [PATCH 03/20] add ptread link to fix cmake build under linux (#114) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add ptread link to fix cmake build under linux * add cmake to linux and macos platform * separate make and cmake workflow --------- Co-authored-by: Sebastián A --- .github/workflows/build.yml | 43 ++++++++++++++++++++++++++++++++++--- CMakeLists.txt | 3 +++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 94f199cb8..a94a38991 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,7 +2,7 @@ name: CI on: [push, pull_request] jobs: - ubuntu-latest: + ubuntu-latest-make: runs-on: ubuntu-latest steps: @@ -18,7 +18,26 @@ jobs: run: | make - macOS-latest: + ubuntu-latest-cmake: + runs-on: ubuntu-latest + + steps: + - name: Clone + uses: actions/checkout@v1 + + - name: Dependencies + run: | + sudo apt-get update + sudo apt-get install build-essential + + - name: Build + run: | + mkdir build + cd build + cmake .. + cmake --build . 
--config Release + + macOS-latest-make: runs-on: macos-latest steps: @@ -33,7 +52,25 @@ jobs: run: | make - windows-latest: + macOS-latest-cmake: + runs-on: macOS-latest + + steps: + - name: Clone + uses: actions/checkout@v1 + + - name: Dependencies + run: | + brew update + + - name: Build + run: | + mkdir build + cd build + cmake .. + cmake --build . --config Release + + windows-latest-cmake: runs-on: windows-latest steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index ca3be38a5..38e7266dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,8 @@ project("llama.cpp") set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) @@ -126,3 +128,4 @@ target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS}) target_include_directories(ggml PUBLIC .) target_link_libraries(quantize PRIVATE ggml) target_link_libraries(llama PRIVATE ggml) +target_link_libraries(ggml PRIVATE Threads::Threads) From 367946c668757532deed929e1d78673c6ac6bcb8 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Fri, 17 Mar 2023 17:47:35 +0000 Subject: [PATCH 04/20] Don't tell users to use a bad number of threads (#243) The readme tells people to use the command line option "-t 8", causing 8 threads to be started. On systems with fewer than 8 cores, this causes a significant slowdown. Remove the option from the example command lines and use /proc/cpuinfo on Linux to determine a sensible default. --- .devops/tools.sh | 2 +- README.md | 10 +++++----- ggml.c | 4 ---- utils.cpp | 12 ++++++++++++ utils.h | 2 +- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/.devops/tools.sh b/.devops/tools.sh index b5711c94e..352e04942 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -34,7 +34,7 @@ else echo "Unknown command: $arg1" echo "Available commands: " echo " --run (-r): Run a model previously converted into ggml" - echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" echo " --convert (-c): Convert a llama model into ggml" echo " ex: \"/models/7B/\" 1" echo " --quantize (-q): Optimize with quantization process ggml" diff --git a/README.md b/README.md index 8cf59f418..7338ea790 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Supported platforms: Here is a typical run using LLaMA-7B: ```java -make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 I llama.cpp build info: I UNAME_S: Darwin I UNAME_P: arm @@ -150,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1 ./quantize.sh 7B # run the inference -./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128 +./main -m ./models/7B/ggml-model-q4_0.bin -n 128 ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. 
@@ -164,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o Here is an example few-shot interaction, invoked with the command ``` -./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \ +./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \ -p \ "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. @@ -218,13 +218,13 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-on On complete, you are ready to play! ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with light image: ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` ## Limitations diff --git a/ggml.c b/ggml.c index c4f838917..4fb83adbd 100644 --- a/ggml.c +++ b/ggml.c @@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - if (cgraph->n_threads <= 0) { - cgraph->n_threads = 8; - } - const int n_threads = cgraph->n_threads; struct ggml_compute_state_shared state_shared = { diff --git a/utils.cpp b/utils.cpp index 26e313d5f..9e50487ef 100644 --- a/utils.cpp +++ b/utils.cpp @@ -16,6 +16,18 @@ #endif bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + // determine sensible default number of threads. + // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. 
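+    // On Linux, count the "processor" entries in /proc/cpuinfo; there is one entry per logical CPU.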
+#ifdef __linux__ + std::ifstream cpuinfo("/proc/cpuinfo"); + params.n_threads = std::count(std::istream_iterator(cpuinfo), + std::istream_iterator(), + std::string("processor")); +#endif + if (params.n_threads == 0) { + params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); + } + for (int i = 1; i < argc; i++) { std::string arg = argv[i]; diff --git a/utils.h b/utils.h index 021120b05..5e5b40ffa 100644 --- a/utils.h +++ b/utils.h @@ -14,7 +14,7 @@ struct gpt_params { int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t n_threads; int32_t n_predict = 128; // new tokens to predict int32_t repeat_last_n = 64; // last n tokens to penalize int32_t n_ctx = 512; //context size From e81b9c81c101f64531ef0fa1ee6b77d562635652 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Mar 2023 20:30:04 +0200 Subject: [PATCH 05/20] Update Contributing section --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7338ea790..808d54e89 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,7 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch - Collaborators will be invited based on contributions - Any help with managing issues and PRs is very appreciated! +- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) ### Coding guidelines From 4f546091102a418ffdc6230f872ac56e5cedb835 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Mar 2023 21:46:46 +0200 Subject: [PATCH 06/20] Default to 4 threads (#243) --- utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils.h b/utils.h index 5e5b40ffa..c1a8498a7 100644 --- a/utils.h +++ b/utils.h @@ -14,11 +14,11 @@ struct gpt_params { int32_t seed = -1; // RNG seed - int32_t n_threads; + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_predict = 128; // new tokens to predict int32_t repeat_last_n = 64; // last n tokens to penalize int32_t n_ctx = 512; //context size - + // sampling parameters int32_t top_k = 40; float top_p = 0.95f; From c9f670a17755311aa28c411f5c7f3c8c05434770 Mon Sep 17 00:00:00 2001 From: thement <40525767+thement@users.noreply.github.com> Date: Fri, 17 Mar 2023 21:05:58 +0100 Subject: [PATCH 07/20] Implement non-greedy tokenizer that tries to maximize token lengths (#242) * Implement non-greedy tokenizer that tries to maximize token lengths * Insert single space in front of the prompt - this is to match original llama tokenizer behavior --------- Co-authored-by: Jakub Horak --- main.cpp | 2 ++ utils.cpp | 70 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/main.cpp b/main.cpp index ca0fca8b3..39c5d7b76 100644 --- a/main.cpp +++ b/main.cpp @@ -845,6 +845,8 @@ int main(int argc, char ** argv) { std::vector logits; + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); // tokenize the prompt std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); diff --git a/utils.cpp b/utils.cpp index 9e50487ef..22ef59377 100644 --- a/utils.cpp +++ b/utils.cpp @@ -287,40 +287,56 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri return tokens; } +// TODO: Calculate this constant from the vocabulary +#define 
MAX_TOKEN_LEN 18 +// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { - //auto res = gpt_tokenize(vocab, text); - - //if (bos) { - // res.insert(res.begin(), 1); // TODO: replace with vocab.bos - //} - std::vector res; + std::vector score; + std::vector prev; + int len = text.length(); + + score.resize(len + 1); + prev.resize(len + 1); + + // Forward pass + for (int i = 0; i < len; i++) { + int max_len = std::min(len - i, MAX_TOKEN_LEN); + for (int sub_len = 1; sub_len <= len - i; sub_len++) { + auto sub = text.substr(i, sub_len); + auto token = vocab.token_to_id.find(sub); + if (token != vocab.token_to_id.end()) { + int token_score = sub.length() * sub.length(); + int local_score = score[i] + token_score; + int next = i + sub_len; + if (score[next] < local_score) { + score[next] = local_score; + prev[next] = (*token).second; + } + } + } + } + + // Backward pass + int i = len; + while (i > 0) { + gpt_vocab::id token_id = prev[i]; + if (token_id == 0) { + // TODO: Return error or something more meaningful + printf("failed to tokenize string!\n"); + break; + } + res.push_back(token_id); + auto token = (*vocab.id_to_token.find(token_id)).second; + i -= token.length(); + } if (bos) { res.push_back(1); // TODO: replace with vocab.bos } - //find the longest token that matches the text - int pos = 0; - while (true) { - int l = 0; - int t = 0; - for (const auto & kv : vocab.id_to_token) { - if (kv.second.size() < l) continue; - if (kv.second.size() > text.size() - pos) continue; - if (text.substr(pos, kv.second.size()) == kv.second) { - l = kv.second.size(); - t = kv.first; - } - } - - if (l == 0) { - break; - } - - res.push_back(t); - pos += l; - } + // Pieces are in reverse order so correct that + std::reverse(res.begin(), res.end()); return res; } From a29274789309029fd88a9465e6d0832d4632272b Mon Sep 17 00:00:00 2001 From: Niklas Korz Date: Fri, 17 Mar 2023 23:03:48 +0100 Subject: [PATCH 08/20] Nix flake (#40) * Nix flake * Nix: only add Accelerate framework on macOS * Nix: development shel, direnv and compatibility * Nix: use python packages supplied by withPackages * Nix: remove channel compatibility * Nix: fix ARM neon dotproduct on macOS --------- Co-authored-by: Pavol Rusnak --- .gitignore | 4 ++++ flake.lock | 43 +++++++++++++++++++++++++++++++++++++++++++ flake.nix | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/.gitignore b/.gitignore index 5eb1ff1b8..3087b0ea5 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,10 @@ models/* /main /quantize +/result arm_neon.h compile_commands.json + +.envrc +.direnv/ diff --git a/flake.lock b/flake.lock new file mode 100644 index 000000000..343996da1 --- /dev/null +++ b/flake.lock @@ -0,0 +1,43 @@ +{ + "nodes": { + "flake-utils": { + "locked": { + "lastModified": 1676283394, + "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1678470307, + "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f", + "type": "github" + }, + "original": { 
+ "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 000000000..dae4ff60f --- /dev/null +++ b/flake.nix @@ -0,0 +1,48 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { + inherit system; + }; + llama-python = pkgs.python310.withPackages (ps: with ps; [ + torch + numpy + sentencepiece + ]); + in + { + packages.default = pkgs.stdenv.mkDerivation { + name = "llama.cpp"; + src = ./.; + nativeBuildInputs = with pkgs; [ cmake ]; + buildInputs = with pkgs; lib.optionals stdenv.isDarwin [ + darwin.apple_sdk.frameworks.Accelerate + ]; + cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [ + "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" + ]; + installPhase = '' + mkdir -p $out/bin + mv llama $out/bin/llama + mv quantize $out/bin/quantize + echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml + cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml + chmod +x $out/bin/convert-pth-to-ggml + ''; + }; + devShells.default = pkgs.mkShell { + packages = with pkgs; [ + cmake + llama-python + ] ++ lib.optionals stdenv.isDarwin [ + darwin.apple_sdk.frameworks.Accelerate + ]; + }; + } + ); +} From b2de7f18dfbb93463eeb5b4392117bbe82d5bd1b Mon Sep 17 00:00:00 2001 From: anzz1 Date: Sat, 18 Mar 2023 09:27:12 +0200 Subject: [PATCH 09/20] CI Improvements (#230) * CI Improvements Manual build feature, autoreleases for Windows * better CI naming convention use branch name in releases and tags --- .github/workflows/build.yml | 57 ++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a94a38991..9c1de5823 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,5 +1,20 @@ name: CI -on: [push, pull_request] + +on: + workflow_dispatch: # allows manual triggering + inputs: + create_release: + description: 'Create new release' + required: true + type: boolean + push: + paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp'] + pull_request: + types: [opened, synchronize, edited, reopened, review_requested, ready_for_review] + paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp'] + +env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} jobs: ubuntu-latest-make: @@ -7,14 +22,17 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Dependencies + id: depends run: | sudo apt-get update sudo apt-get install build-essential - name: Build + id: make_build run: | make @@ -42,13 +60,16 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Dependencies + id: depends run: | brew update - name: Build + id: make_build run: | make @@ -75,15 +96,49 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Build + id: cmake_build run: | mkdir build cd build cmake .. cmake --build . 
--config Release + - name: Get commit hash + id: commit + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: pr-mpt/actions-commit-hash@v2 + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + 7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\* + + - name: Create release + id: create_release + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: zendesk/action-create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }} + + - name: Upload release + id: upload_release + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip + asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip + asset_content_type: application/octet-stream + # ubuntu-latest-gcc: # runs-on: ubuntu-latest # From a81d0c2a171a4446e6a21a3ec74a0c0768d71184 Mon Sep 17 00:00:00 2001 From: Gary Linscott Date: Sat, 18 Mar 2023 04:17:19 -0700 Subject: [PATCH 10/20] Fix n^2 loop in tokenization (#254) This causes long prompts to parse very slowly. --- utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.cpp b/utils.cpp index 22ef59377..efa2e3c35 100644 --- a/utils.cpp +++ b/utils.cpp @@ -302,7 +302,7 @@ std::vector llama_tokenize(const gpt_vocab & vocab, const std::st // Forward pass for (int i = 0; i < len; i++) { int max_len = std::min(len - i, MAX_TOKEN_LEN); - for (int sub_len = 1; sub_len <= len - i; sub_len++) { + for (int sub_len = 1; sub_len <= max_len; sub_len++) { auto sub = text.substr(i, sub_len); auto token = vocab.token_to_id.find(sub); if (token != vocab.token_to_id.end()) { From e03e359730c127f888fcf00e93375771bc0a3500 Mon Sep 17 00:00:00 2001 From: Justin Suess Date: Sat, 18 Mar 2023 07:44:09 -0400 Subject: [PATCH 11/20] fixed warning with std::ignore about unused function result (#151) fixed warning with std::ignore about unused function result --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 39c5d7b76..eb78fe5ab 100644 --- a/main.cpp +++ b/main.cpp @@ -1011,7 +1011,7 @@ int main(int argc, char ** argv) { if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) { // presumable empty line, consume the newline - scanf("%*c"); + std::ignore = scanf("%*c"); n_read=0; } if(params.use_color) printf(ANSI_COLOR_RESET); From d3f202d57b694376cef6f381a6b6901825c3f6d9 Mon Sep 17 00:00:00 2001 From: Alex Nguyen Date: Sat, 18 Mar 2023 20:51:49 +0700 Subject: [PATCH 12/20] Remove unused code since n_vocab is model.hparams.n_vocab (#262) --- main.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/main.cpp b/main.cpp index eb78fe5ab..c88405b82 100644 --- a/main.cpp +++ b/main.cpp @@ -143,16 +143,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab // load 
vocab { - const int32_t n_vocab = model.hparams.n_vocab; - - if (n_vocab != model.hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); - return false; - } - std::string word; - for (int i = 0; i < n_vocab; i++) { + for (int i = 0; i < model.hparams.n_vocab; i++) { uint32_t len; fin.read((char *) &len, sizeof(len)); From 554b54152145c30618bac171efb712cf4a7d1e96 Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Sat, 18 Mar 2023 21:58:46 +0100 Subject: [PATCH 13/20] Add memory/disk requirements to readme --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 808d54e89..fc8b2fda3 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,17 @@ python3 convert-pth-to-ggml.py models/7B/ 1 When running the larger models, make sure you have enough disk space to store all the intermediate files. -TODO: add model disk/mem requirements +### Memory/Disk Requirements + +As the models are currently fully loaded into memory, you will need adequate disk space to save them +and sufficient RAM to load them. At the moment, memory and disk requirements are the same. + +| model | original size | quantized size (4-bit) | +|-------|---------------|------------------------| +| 7B | 13 GB | 3.9 GB | +| 15B | 24 GB | 7.8 GB | +| 30B | 60 GB | 19.5 GB | +| 65B | 120 GB | 38.5 GB | ### Interactive mode From 1e5a6d088d0f3a967c6e86298a756daec9e8df12 Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Sat, 18 Mar 2023 22:20:04 +0100 Subject: [PATCH 14/20] Add note about Python 3.11 to readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index fc8b2fda3..187f82f61 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,8 @@ python3 convert-pth-to-ggml.py models/7B/ 1 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 ``` +Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11. + When running the larger models, make sure you have enough disk space to store all the intermediate files. ### Memory/Disk Requirements From 6f61c18ec9a30416e21ed5abfb1321bdb14979be Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Sat, 18 Mar 2023 22:39:46 +0100 Subject: [PATCH 15/20] Fix typo in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 187f82f61..1fe5b5426 100644 --- a/README.md +++ b/README.md @@ -165,7 +165,7 @@ and sufficient RAM to load them. At the moment, memory and disk requirements are | model | original size | quantized size (4-bit) | |-------|---------------|------------------------| | 7B | 13 GB | 3.9 GB | -| 15B | 24 GB | 7.8 GB | +| 13B | 24 GB | 7.8 GB | | 30B | 60 GB | 19.5 GB | | 65B | 120 GB | 38.5 GB | From d7def1a7524f712e5ebb7cd02bab0f13aa56a7f9 Mon Sep 17 00:00:00 2001 From: Ronsor Date: Sat, 18 Mar 2023 17:10:47 -0700 Subject: [PATCH 16/20] Warn user if a context size greater than 2048 tokens is specified (#274) LLaMA doesn't support more than 2048 token context sizes, and going above that produces terrible results. 
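This is expected: the model's rotary position embeddings are only trained on
positions up to 2048, so longer contexts fall outside the training distribution.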
--- main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.cpp b/main.cpp index c88405b82..105dd91ee 100644 --- a/main.cpp +++ b/main.cpp @@ -792,6 +792,11 @@ int main(int argc, char ** argv) { if (gpt_params_parse(argc, argv, params) == false) { return 1; } + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } if (params.seed < 0) { params.seed = time(NULL); From 22213a17b56336bbea384a572a9484ce208c0333 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 19 Mar 2023 17:30:00 +0200 Subject: [PATCH 17/20] Change RMSNorm eps to 1e-6 (#173) I think this is what is used in the Python code --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 4fb83adbd..4813f74c8 100644 --- a/ggml.c +++ b/ggml.c @@ -5556,7 +5556,7 @@ static void ggml_compute_forward_rms_norm_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - const ggml_float eps = 1e-5f; // TODO: make this a parameter + const ggml_float eps = 1e-6f; // TODO: make this a parameter // TODO: optimize for (int i03 = 0; i03 < ne03; i03++) { @@ -5572,7 +5572,7 @@ static void ggml_compute_forward_rms_norm_f32( mean /= ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - + memcpy(y, x, ne00 * sizeof(float)); // for (int i00 = 0; i00 < ne00; i00++) { // y[i00] = x[i00]; From 9e1707218a24ff758c7b623594f8c0ce5e12eb6c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 19 Mar 2023 18:37:02 +0200 Subject: [PATCH 18/20] Add "--instruct" argument for usage with Alpaca (#240) Also start adding prompts in "./prompts" --- main.cpp | 70 +++++++++++++++++++++++++-------------- prompts/alpaca.txt | 1 + prompts/chat-with-bob.txt | 7 ++++ utils.cpp | 18 ++++------ utils.h | 8 ++--- 5 files changed, 64 insertions(+), 40 deletions(-) create mode 100644 prompts/alpaca.txt create mode 100644 prompts/chat-with-bob.txt diff --git a/main.cpp b/main.cpp index 105dd91ee..a95e2e721 100644 --- a/main.cpp +++ b/main.cpp @@ -176,8 +176,6 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab } } - const ggml_type wtype2 = GGML_TYPE_F32; - auto & ctx = model.ctx; size_t ctx_size = 0; @@ -237,7 +235,6 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; model.layers.resize(n_layer); @@ -539,9 +536,7 @@ bool llama_eval( const int n_vocab = hparams.n_vocab; const int n_rot = hparams.n_embd/hparams.n_head; - const int d_key = n_embd/n_head; - - // TODO: check if this size scales with n_ctx linearly and remove constant. somehow I feel it wasn't the case + // TODO: check if this size scales with n_ctx linearly and remove constant. 
somehow I feel it wasn't the case // static size_t buf_size = hparams.n_ctx*1024*1024; static size_t buf_size = 512u*1024*1024; static void * buf = malloc(buf_size); @@ -792,7 +787,7 @@ int main(int argc, char ** argv) { if (gpt_params_parse(argc, argv, params) == false) { return 1; } - + if (params.n_ctx > 2048) { fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); @@ -820,7 +815,7 @@ int main(int argc, char ** argv) { // load the model { const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(params.model, model, vocab, params.n_ctx)) { + if (!llama_model_load(params.model, model, vocab, params.n_ctx)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); return 1; } @@ -849,9 +844,25 @@ int main(int argc, char ** argv) { params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + // prefix & suffix for instruct mode + const std::vector inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true); + const std::vector inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false); + + // in instruct mode, we inject a prefix and a suffix to each input by the user + if (params.instruct) { + fprintf(stderr, "== Instruction mode enabled ==\n"); + params.interactive = true; + params.antiprompt = "### Instruction:\n\n"; + } + // tokenize the reverse prompt std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); + // enable interactive mode if reverse prompt is specified + if (!antiprompt_inp.empty()) { + params.interactive = true; + } + fprintf(stderr, "\n"); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); @@ -872,7 +883,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: interactive mode on.\n", __func__); - if(antiprompt_inp.size()) { + if (antiprompt_inp.size()) { fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); for (int i = 0; i < (int) antiprompt_inp.size(); i++) { @@ -894,31 +905,27 @@ int main(int argc, char ** argv) { std::vector last_n_tokens(last_n_size); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - if (params.interactive) { fprintf(stderr, "== Running in interactive mode. 
==\n" #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) " - Press Ctrl+C to interject at any time.\n" #endif " - Press Return to return control to LLaMa.\n" - " - If you want to submit another line, end your input in '\\'.\n"); + " - If you want to submit another line, end your input in '\\'.\n\n"); + is_interacting = true; } - int remaining_tokens = params.n_predict; int input_consumed = 0; bool input_noecho = false; - // prompt user immediately after the starting prompt has been loaded - if (params.interactive_start) { - is_interacting = true; - } + int remaining_tokens = params.n_predict; // set the color for the prompt which will be output initially if (params.use_color) { printf(ANSI_COLOR_YELLOW); } - while (remaining_tokens > 0) { + while (remaining_tokens > 0 || params.interactive) { // predict if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); @@ -971,13 +978,13 @@ int main(int argc, char ** argv) { last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(embd_inp[input_consumed]); ++input_consumed; - if (embd.size() > params.n_batch) { + if ((int) embd.size() > params.n_batch) { break; } } // reset color to default if we there is no pending user input - if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) { + if (!input_noecho && params.use_color && (int) embd_inp.size() == input_consumed) { printf(ANSI_COLOR_RESET); } } @@ -999,19 +1006,26 @@ int main(int argc, char ** argv) { is_interacting = true; } if (is_interacting) { + if (params.instruct) { + input_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); + + printf("\n> "); + } + // currently being interactive - bool another_line=true; + bool another_line = true; while (another_line) { fflush(stdout); char buf[256] = {0}; int n_read; - if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); + if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) { // presumable empty line, consume the newline std::ignore = scanf("%*c"); n_read=0; } - if(params.use_color) printf(ANSI_COLOR_RESET); + if (params.use_color) printf(ANSI_COLOR_RESET); if (n_read > 0 && buf[n_read-1]=='\\') { another_line = true; @@ -1026,6 +1040,10 @@ int main(int argc, char ** argv) { std::vector line_inp = ::llama_tokenize(vocab, buf, false); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + if (params.instruct) { + embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + } + remaining_tokens -= line_inp.size(); input_noecho = true; // do not echo this again @@ -1037,8 +1055,12 @@ int main(int argc, char ** argv) { // end of text token if (embd.back() == 2) { - fprintf(stderr, " [end of text]\n"); - break; + if (params.interactive) { + is_interacting = true; + } else { + fprintf(stderr, " [end of text]\n"); + break; + } } } diff --git a/prompts/alpaca.txt b/prompts/alpaca.txt new file mode 100644 index 000000000..2224bdeb0 --- /dev/null +++ b/prompts/alpaca.txt @@ -0,0 +1 @@ +Below is an instruction that describes a task. Write a response that appropriately completes the request. diff --git a/prompts/chat-with-bob.txt b/prompts/chat-with-bob.txt new file mode 100644 index 000000000..009da39ae --- /dev/null +++ b/prompts/chat-with-bob.txt @@ -0,0 +1,7 @@ +Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. 
+ +User: Hello, Bob. +Bob: Hello. How may I help you today? +User: Please tell me the largest city in Europe. +Bob: Sure. The largest city in Europe is Moscow, the capital of Russia. +User: diff --git a/utils.cpp b/utils.cpp index efa2e3c35..be81c6cd0 100644 --- a/utils.cpp +++ b/utils.cpp @@ -38,13 +38,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; } else if (arg == "-f" || arg == "--file") { - std::ifstream file(argv[++i]); - - std::copy(std::istreambuf_iterator(file), - std::istreambuf_iterator(), - back_inserter(params.prompt)); - + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); } else if (arg == "-n" || arg == "--n_predict") { params.n_predict = std::stoi(argv[++i]); } else if (arg == "--top_k") { @@ -65,9 +60,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.model = argv[++i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; - } else if (arg == "--interactive-start") { - params.interactive = true; - params.interactive_start = true; + } else if (arg == "-ins" || arg == "--instruct") { + params.instruct = true; } else if (arg == "--color") { params.use_color = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -85,13 +79,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return true; } -void gpt_print_usage(int argc, char ** argv, const gpt_params & params) { +void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -i, --interactive run in interactive mode\n"); - fprintf(stderr, " --interactive-start run in interactive mode and poll user input at startup\n"); + fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT\n"); fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); @@ -398,7 +392,7 @@ gpt_vocab::id llama_sample_top_p_top_k( logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i)); } else { logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i)); - } + } } else { logits_id.push_back(std::make_pair(logits[i]*scale, i)); } diff --git a/utils.h b/utils.h index c1a8498a7..e329ba168 100644 --- a/utils.h +++ b/utils.h @@ -27,14 +27,14 @@ struct gpt_params { int32_t n_batch = 8; // batch size for prompt processing - std::string model = "models/lamma-7B/ggml-model.bin"; // model path - std::string prompt; + std::string model = "models/lamma-7B/ggml-model.bin"; // model path + std::string prompt = ""; + std::string antiprompt = ""; // string upon seeing which more user input is prompted bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode - bool interactive_start = false; // reverse prompt immediately - std::string antiprompt = ""; // string upon seeing which more user input is prompted + bool instruct = false; // instruction mode (used for Alpaca models) }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); From a4e63b73dfa1894387926cc8072b5f36deebf0a5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 
19 Mar 2023 18:49:50 +0200 Subject: [PATCH 19/20] Add instruction for using Alpaca (#240) --- README.md | 48 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 1fe5b5426..65325fc7f 100644 --- a/README.md +++ b/README.md @@ -176,21 +176,51 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o Here is an example few-shot interaction, invoked with the command ``` -./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \ - -p \ -"Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. - -User: Hello, Bob. -Bob: Hello. How may I help you today? -User: Please tell me the largest city in Europe. -Bob: Sure. The largest city in Europe is Moscow, the capital of Russia. -User:" +./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt ``` Note the use of `--color` to distinguish between user input and generated text. ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) +### Instruction mode with Alpaca + +First, download the `ggml` Alpaca model into the `./models` folder: + +``` +# use one of these +# NOTE: these are copied from the alpaca.cpp repo - not sure how long these will work +# TODO: add a script to simplify the download +curl -o ggml-alpaca-7b-q4.bin -C - https://gateway.estuary.tech/gw/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC +curl -o ggml-alpaca-7b-q4.bin -C - https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC +curl -o ggml-alpaca-7b-q4.bin -C - https://cloudflare-ipfs.com/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC +``` + +Now run the `main` tool like this: + +``` +./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins +``` + +Sample run: + +``` +== Running in interactive mode. == + - Press Ctrl+C to interject at any time. + - Press Return to return control to LLaMa. + - If you want to submit another line, end your input in '\'. + + Below is an instruction that describes a task. Write a response that appropriately completes the request. + +> How many letters are there in the English alphabet? +There 26 letters in the English Alphabet +> What is the most common way of transportation in Amsterdam? +The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis +> List 5 words that start with "ca". +cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. +> +``` + ### Android You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux). 
From 70f01cb8632f73b5cf70428608b89cd3c0775d23 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 19 Mar 2023 19:04:44 +0200
Subject: [PATCH 20/20] Drop trailing newline from file prompts (#80)

---
 main.cpp  | 1 -
 utils.cpp | 3 +++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/main.cpp b/main.cpp
index a95e2e721..2a07bbb40 100644
--- a/main.cpp
+++ b/main.cpp
@@ -850,7 +850,6 @@ int main(int argc, char ** argv) {
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
-        fprintf(stderr, "== Instruction mode enabled ==\n");
         params.interactive = true;
         params.antiprompt = "### Instruction:\n\n";
     }
 
diff --git a/utils.cpp b/utils.cpp
index be81c6cd0..320d7c31c 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -40,6 +40,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "-f" || arg == "--file") {
             std::ifstream file(argv[++i]);
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
         } else if (arg == "-n" || arg == "--n_predict") {
             params.n_predict = std::stoi(argv[++i]);
         } else if (arg == "--top_k") {