Merge branch 'master' into fix-cmake-pthread

This commit is contained in:
Sebastián A 2023-03-17 13:33:42 -03:00 committed by GitHub
commit c0f6681694
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 403 additions and 46 deletions

17
.devops/full.Dockerfile Normal file
View file

@ -0,0 +1,17 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip
RUN pip install --upgrade pip setuptools wheel \
&& pip install torch torchvision torchaudio sentencepiece numpy
WORKDIR /app
COPY . .
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

18
.devops/main.Dockerfile Normal file
View file

@ -0,0 +1,18 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential
WORKDIR /app
COPY . .
RUN make
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]

46
.devops/tools.sh Executable file
View file

@ -0,0 +1,46 @@
#!/bin/bash
set -e
# Read the first argument into a variable
arg1="$1"
# Shift the arguments to remove the first one
shift
# Join the remaining arguments into a single string
arg2="$@"
if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
python3 ./convert-pth-to-ggml.py $arg2
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
./quantize $arg2
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
./main $arg2
elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
python3 ./download-pth.py $arg2
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
echo "Downloading model..."
python3 ./download-pth.py "$1" "$2"
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
if [ -f "${i/f16/q4_0}" ]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./quantize "$i" "${i/f16/q4_0}" 2
fi
done
else
echo "Unknown command: $arg1"
echo "Available commands: "
echo " --run (-r): Run a model previously converted into ggml"
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
echo " --convert (-c): Convert a llama model into ggml"
echo " ex: \"/models/7B/\" 1"
echo " --quantize (-q): Optimize with quantization process ggml"
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
echo " ex: \"/models/\" 7B"
echo " --all-in-one (-a): Execute --download, --convert & --quantize"
echo " ex: \"/models/\" 7B"
fi

24
.dockerignore Normal file
View file

@ -0,0 +1,24 @@
*.o
*.a
.cache/
.vs/
.vscode/
.DS_Store
build/
build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
models/*
/main
/quantize
arm_neon.h
compile_commands.json
Dockerfile

View file

@ -38,7 +38,7 @@ jobs:
cmake --build . --config Release
macOS-latest-make:
runs-on: macOS-latest
runs-on: macos-latest
steps:
- name: Clone

61
.github/workflows/docker.yml vendored Normal file
View file

@ -0,0 +1,61 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
# GitHub recommends pinning actions to a commit SHA.
# To get a newer version, you will need to update the SHA.
# You can also reference a tag or branch, but the action may change without warning.
name: Publish Docker image
on:
pull_request:
push:
branches:
- master
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
runs-on: ubuntu-latest
env:
COMMIT_SHA: ${{ github.sha }}
strategy:
matrix:
config:
- { tag: "light", dockerfile: ".devops/main.Dockerfile" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile" }
steps:
- name: Check out the repo
uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Log in to Docker Hub
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
with:
context: .
push: true
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
uses: docker/build-push-action@v4
with:
context: .
push: ${{ github.event_name == 'push' }}
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

View file

@ -7,6 +7,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:**
- RMSNorm implementation / fixes: https://github.com/ggerganov/llama.cpp/issues/173
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
@ -31,6 +32,7 @@ Supported platforms:
- [X] Mac OS
- [X] Linux
- [X] Windows (via CMake)
- [X] Docker
---
@ -193,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
### Docker
#### Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (in ex. im using /llama/models)
#### Images
We have two Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
#### Usage
The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
On complete, you are ready to play!
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
```
or with light image:
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
```
## Limitations
@ -206,8 +239,9 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
### Contributing
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues and PRs is very appreciated!
### Coding guidelines
@ -217,7 +251,3 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
### Misc
- Practice your C++ typing skills: https://typing-battles.ggerganov.com

View file

@ -16,7 +16,7 @@
# At the start of the ggml file we write the model parameters
# and vocabulary.
#
import os
import sys
import json
import struct
@ -64,6 +64,10 @@ if len(sys.argv) > 2:
sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
if os.path.exists(fname_out):
print(f"Skip conversion, it already exists: {fname_out}")
sys.exit(0)
with open(fname_hparams, "r") as f:
hparams = json.load(f)

66
download-pth.py Normal file
View file

@ -0,0 +1,66 @@
import os
import sys
from tqdm import tqdm
import requests
if len(sys.argv) < 3:
print("Usage: download-pth.py dir-model model-type\n")
print(" model-type: Available models 7B, 13B, 30B or 65B")
sys.exit(1)
modelsDir = sys.argv[1]
model = sys.argv[2]
num = {
"7B": 1,
"13B": 2,
"30B": 4,
"65B": 8,
}
if model not in num:
print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
sys.exit(1)
print(f"Downloading model {model}")
files = ["checklist.chk", "params.json"]
for i in range(num[model]):
files.append(f"consolidated.0{i}.pth")
resolved_path = os.path.abspath(os.path.join(modelsDir, model))
os.makedirs(resolved_path, exist_ok=True)
for file in files:
dest_path = os.path.join(resolved_path, file)
if os.path.exists(dest_path):
print(f"Skip file download, it already exists: {file}")
continue
url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
response = requests.get(url, stream=True)
with open(dest_path, 'wb') as f:
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
t.update(len(chunk))
files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
for file in files2:
dest_path = os.path.join(modelsDir, file)
if os.path.exists(dest_path):
print(f"Skip file download, it already exists: {file}")
continue
url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
response = requests.get(url, stream=True)
with open(dest_path, 'wb') as f:
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
t.update(len(chunk))

149
ggml.c
View file

@ -607,10 +607,11 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;
const size_t bs = 2*sizeof(float) + QK/2;
float * restrict pm = (float *) (y);
float * restrict pd = (float *) (pm + nb);
uint8_t * restrict pb = (uint8_t *) (pd + nb);
uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
uint8_t * restrict pm = ((uint8_t *)y + 0*bs + sizeof(float));
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + 2*sizeof(float));
uint8_t pp[QK/2];
@ -627,8 +628,10 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
pm[i] = min;
pd[i] = d;
*(float *)pm = min;
*(float *)pd = d;
pm += bs;
pd += bs;
for (int l = 0; l < QK; l += 2) {
const float v0 = (x[i*QK + l + 0] - min)*id;
@ -643,7 +646,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(pb + i*QK/2, pp, sizeof(pp));
memcpy(pb, pp, sizeof(pp));
pb += bs;
}
}
@ -687,16 +691,17 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;
const size_t bs = 2*sizeof(float) + QK/2;
const float * restrict pm = (const float *) (x);
const float * restrict pd = (const float *) (pm + nb);
const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
for (int i = 0; i < nb; i++) {
const float m = pm[i];
const float d = pd[i];
const float d = *(const float *) (pd + i*bs);
const float m = *(const float *) (pm + i*bs);
const uint8_t * restrict pp = pb + i*QK/2;
const uint8_t * restrict pp = pb + i*bs;
for (int l = 0; l < QK; l += 2) {
const uint8_t vi = pp[l/2];
@ -1584,28 +1589,109 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
const int nb = n / QK;
const float * restrict pm0 = (const float *) x;
const float * restrict pm1 = (const float *) y;
const size_t bs = 2*sizeof(float) + QK/2;
const float * restrict pd0 = (const float *) (pm0 + nb);
const float * restrict pd1 = (const float *) (pm1 + nb);
const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs);
const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs);
const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
const uint8_t * restrict pm0 = ((const uint8_t *)x + 0*bs + sizeof(float));
const uint8_t * restrict pm1 = ((const uint8_t *)y + 0*bs + sizeof(float));
const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + 2*sizeof(float));
float sumf = 0.0;
#if 1
#if defined(__AVX2__)
#if QK == 32
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
// Accumulator for constant offsets
float acc_offset = 0.0f;
// Main loop
for (int i = 0; i < nb; ++i) {
const float * m0 = (const float *) (pm0 + i*bs);
const float * m1 = (const float *) (pm1 + i*bs);
const float * d0 = (const float *) (pd0 + i*bs);
const float * d1 = (const float *) (pd1 + i*bs);
const uint8_t * restrict p0 = pb0 + i*bs;
const uint8_t * restrict p1 = pb1 + i*bs;
const __m256 d0v = _mm256_broadcast_ss( d0 );
const __m256 d1v = _mm256_broadcast_ss( d1 );
const __m256 m0v = _mm256_broadcast_ss( m0 );
const __m256 m1v = _mm256_broadcast_ss( m1 );
// Compute combined scale for the block
const __m256 scale_01 = _mm256_mul_ps( d0v, d1v );
// Compute cross scales for the block
const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
__m256i bx = bytesFromNibbles( p0 );
__m256i by = bytesFromNibbles( p1 );
// Now we have a vector with bytes in [ 0 .. 15 ] interval.
// Sign-extend first 16 signed bytes into int16_t
__m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
__m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
// Compute products of int16_t integers, add pairwise
__m256i i32 = _mm256_madd_epi16( x16, y16 );
// Sign-extend last 16 signed bytes into int16_t vectors
__m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
__m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
// Accumulate products of int16_t integers
i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) );
// compute sums of unsigned bytes in bx, by in blocks of 8.
// This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000,
// which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400.
// so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ]
__m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() );
__m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() );
__m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) );
__m256 sums = _mm256_cvtepi32_ps( sumsi );
// Convert int32_t to float
__m256 p = _mm256_cvtepi32_ps( i32 );
// Apply the scale, and accumulate
// acc += d0*d1*x*y + d0*m1*x + d1*m0*y
acc = _mm256_fmadd_ps( scale_01, p, acc );
acc = _mm256_fmadd_ps( cross_scales, sums, acc );
// acc_offset += m0*m1 (for each entry in the block)
acc_offset += (*m0)*(*m1);
}
// Return horizontal sum of the acc vector
__m128 res = _mm256_extractf128_ps( acc, 1 );
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
#else
#error "not implemented for QK"
#endif
#else
// scalar
for (int i = 0; i < nb; i++) {
const float m0 = pm0[i];
const float m1 = pm1[i];
const float m0 = *(const float *) (pm0 + i*bs);
const float m1 = *(const float *) (pm1 + i*bs);
const float d0 = pd0[i];
const float d1 = pd1[i];
const float d0 = *(const float *) (pd0 + i*bs);
const float d1 = *(const float *) (pd1 + i*bs);
const uint8_t * restrict p0 = pb0 + i*QK/2;
const uint8_t * restrict p1 = pb1 + i*QK/2;
const uint8_t * restrict p0 = pb0 + i*bs;
const uint8_t * restrict p1 = pb1 + i*bs;
for (int j = 0; j < QK/2; j++) {
const uint8_t v0 = p0[j];
@ -1839,16 +1925,17 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res
assert(n % QK == 0);
const int nb = n / QK;
const size_t bs = 2*sizeof(float) + QK/2;
const float * restrict pm = (const float *) (x);
const float * restrict pd = (const float *) (pm + nb);
const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
for (int i = 0; i < nb; i++) {
const float m = pm[i];
const float d = pd[i];
const float d = *(const float *) (pd + i*bs);
const float m = *(const float *) (pm + i*bs);
const uint8_t * restrict pp = pb + i*QK/2;
const uint8_t * restrict pp = pb + i*bs;
for (int l = 0; l < QK; l += 2) {
const uint8_t vi = pp[l/2];

View file

@ -489,7 +489,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;
assert(k % qk == 0);
@ -498,10 +499,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
char * pdst = (char *) dst;
for (int j = 0; j < n; j += k) {
float * pm = (float *) (pdst + (j/k)*row_size);
float * pd = (float *) (pm + nb);
uint8_t * pb = (uint8_t *) (pd + nb);
for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
@ -519,8 +520,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
pm[i] = min;
pd[i] = d;
*(float *) pd = d;
*(float *) pm = min;
pd += bs;
pm += bs;
for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0] - min)*id;
@ -538,7 +541,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(pb + i*qk/2, pp, pp_size);
memcpy(pb, pp, pp_size);
pb += bs;
}
}
}