Merge remote-tracking branch 'origin/master' into concedo
This commit is contained in:
commit
a19b5a4adc
17 changed files with 600 additions and 79 deletions
17
.devops/full.Dockerfile
Normal file
17
.devops/full.Dockerfile
Normal file
|
@ -0,0 +1,17 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install torch torchvision torchaudio sentencepiece numpy
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN make
|
||||
|
||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
18
.devops/main.Dockerfile
Normal file
18
.devops/main.Dockerfile
Normal file
|
@ -0,0 +1,18 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN make
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/main /main
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
46
.devops/tools.sh
Executable file
46
.devops/tools.sh
Executable file
|
@ -0,0 +1,46 @@
|
|||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Read the first argument into a variable
|
||||
arg1="$1"
|
||||
|
||||
# Shift the arguments to remove the first one
|
||||
shift
|
||||
|
||||
# Join the remaining arguments into a single string
|
||||
arg2="$@"
|
||||
|
||||
if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
|
||||
python3 ./convert-pth-to-ggml.py $arg2
|
||||
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
|
||||
./quantize $arg2
|
||||
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
|
||||
./main $arg2
|
||||
elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
|
||||
python3 ./download-pth.py $arg2
|
||||
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
|
||||
echo "Downloading model..."
|
||||
python3 ./download-pth.py "$1" "$2"
|
||||
echo "Converting PTH to GGML..."
|
||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||
if [ -f "${i/f16/q4_0}" ]; then
|
||||
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
|
||||
else
|
||||
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
|
||||
./quantize "$i" "${i/f16/q4_0}" 2
|
||||
fi
|
||||
done
|
||||
else
|
||||
echo "Unknown command: $arg1"
|
||||
echo "Available commands: "
|
||||
echo " --run (-r): Run a model previously converted into ggml"
|
||||
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
|
||||
echo " --convert (-c): Convert a llama model into ggml"
|
||||
echo " ex: \"/models/7B/\" 1"
|
||||
echo " --quantize (-q): Optimize with quantization process ggml"
|
||||
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
|
||||
echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
|
||||
echo " ex: \"/models/\" 7B"
|
||||
echo " --all-in-one (-a): Execute --download, --convert & --quantize"
|
||||
echo " ex: \"/models/\" 7B"
|
||||
fi
|
24
.dockerignore
Normal file
24
.dockerignore
Normal file
|
@ -0,0 +1,24 @@
|
|||
*.o
|
||||
*.a
|
||||
.cache/
|
||||
.vs/
|
||||
.vscode/
|
||||
.DS_Store
|
||||
|
||||
build/
|
||||
build-em/
|
||||
build-debug/
|
||||
build-release/
|
||||
build-static/
|
||||
build-no-accel/
|
||||
build-sanitize-addr/
|
||||
build-sanitize-thread/
|
||||
|
||||
models/*
|
||||
|
||||
/main
|
||||
/quantize
|
||||
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
Dockerfile
|
45
.github/workflows/build.yml
vendored
45
.github/workflows/build.yml
vendored
|
@ -2,7 +2,7 @@ name: CI
|
|||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
ubuntu-latest:
|
||||
ubuntu-latest-make:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
|
@ -18,8 +18,27 @@ jobs:
|
|||
run: |
|
||||
make
|
||||
|
||||
macOS-latest:
|
||||
runs-on: macOS-latest
|
||||
ubuntu-latest-cmake:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
cmake --build . --config Release
|
||||
|
||||
macOS-latest-make:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
@ -33,7 +52,25 @@ jobs:
|
|||
run: |
|
||||
make
|
||||
|
||||
windows-latest:
|
||||
macOS-latest-cmake:
|
||||
runs-on: macOS-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
brew update
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
cmake --build . --config Release
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-latest
|
||||
|
||||
steps:
|
||||
|
|
61
.github/workflows/docker.yml
vendored
Normal file
61
.github/workflows/docker.yml
vendored
Normal file
|
@ -0,0 +1,61 @@
|
|||
# This workflow uses actions that are not certified by GitHub.
|
||||
# They are provided by a third-party and are governed by
|
||||
# separate terms of service, privacy policy, and support
|
||||
# documentation.
|
||||
|
||||
# GitHub recommends pinning actions to a commit SHA.
|
||||
# To get a newer version, you will need to update the SHA.
|
||||
# You can also reference a tag or branch, but the action may change without warning.
|
||||
|
||||
name: Publish Docker image
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Build and push Docker image (versioned)
|
||||
if: github.event_name == 'push'
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
- name: Build and push Docker image (tagged)
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' }}
|
||||
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -18,6 +18,10 @@ models/*
|
|||
|
||||
/main
|
||||
/quantize
|
||||
/result
|
||||
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
|
||||
.envrc
|
||||
.direnv/
|
||||
|
|
|
@ -4,6 +4,8 @@ project("llama.cpp")
|
|||
set(CMAKE_CXX_STANDARD 20)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
||||
|
@ -126,3 +128,4 @@ target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
|
|||
target_include_directories(ggml PUBLIC .)
|
||||
target_link_libraries(quantize PRIVATE ggml)
|
||||
target_link_libraries(llama PRIVATE ggml)
|
||||
target_link_libraries(ggml PRIVATE Threads::Threads)
|
||||
|
|
39
README.md
39
README.md
|
@ -32,13 +32,14 @@ Supported platforms:
|
|||
- [X] Mac OS
|
||||
- [X] Linux
|
||||
- [X] Windows (via CMake)
|
||||
- [X] Docker
|
||||
|
||||
---
|
||||
|
||||
Here is a typical run using LLaMA-7B:
|
||||
|
||||
```java
|
||||
make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
|
||||
make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
I llama.cpp build info:
|
||||
I UNAME_S: Darwin
|
||||
I UNAME_P: arm
|
||||
|
@ -149,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1
|
|||
./quantize.sh 7B
|
||||
|
||||
# run the inference
|
||||
./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
|
||||
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
|
||||
```
|
||||
|
||||
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||
|
@ -163,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o
|
|||
|
||||
Here is an example few-shot interaction, invoked with the command
|
||||
```
|
||||
./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
|
||||
./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
|
||||
-p \
|
||||
"Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
|
||||
|
||||
|
@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
|
|||
|
||||
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
||||
|
||||
### Docker
|
||||
|
||||
#### Prerequisites
|
||||
* Docker must be installed and running on your system.
|
||||
* Create a folder to store big models & intermediate files (in ex. im using /llama/models)
|
||||
|
||||
#### Images
|
||||
We have two Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
|
||||
|
||||
#### Usage
|
||||
|
||||
The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
|
||||
|
||||
```bash
|
||||
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
|
||||
```
|
||||
|
||||
On complete, you are ready to play!
|
||||
|
||||
```bash
|
||||
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with light image:
|
||||
|
||||
```bash
|
||||
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
|
@ -210,6 +242,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
|
|||
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
|
||||
- Collaborators will be invited based on contributions
|
||||
- Any help with managing issues and PRs is very appreciated!
|
||||
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
|
||||
|
||||
### Coding guidelines
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
# At the start of the ggml file we write the model parameters
|
||||
# and vocabulary.
|
||||
#
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import struct
|
||||
|
@ -64,6 +64,10 @@ if len(sys.argv) > 2:
|
|||
sys.exit(1)
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
|
||||
|
||||
if os.path.exists(fname_out):
|
||||
print(f"Skip conversion, it already exists: {fname_out}")
|
||||
sys.exit(0)
|
||||
|
||||
with open(fname_hparams, "r") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
|
|
66
download-pth.py
Normal file
66
download-pth.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
import os
|
||||
import sys
|
||||
from tqdm import tqdm
|
||||
import requests
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: download-pth.py dir-model model-type\n")
|
||||
print(" model-type: Available models 7B, 13B, 30B or 65B")
|
||||
sys.exit(1)
|
||||
|
||||
modelsDir = sys.argv[1]
|
||||
model = sys.argv[2]
|
||||
|
||||
num = {
|
||||
"7B": 1,
|
||||
"13B": 2,
|
||||
"30B": 4,
|
||||
"65B": 8,
|
||||
}
|
||||
|
||||
if model not in num:
|
||||
print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Downloading model {model}")
|
||||
|
||||
files = ["checklist.chk", "params.json"]
|
||||
|
||||
for i in range(num[model]):
|
||||
files.append(f"consolidated.0{i}.pth")
|
||||
|
||||
resolved_path = os.path.abspath(os.path.join(modelsDir, model))
|
||||
os.makedirs(resolved_path, exist_ok=True)
|
||||
|
||||
for file in files:
|
||||
dest_path = os.path.join(resolved_path, file)
|
||||
|
||||
if os.path.exists(dest_path):
|
||||
print(f"Skip file download, it already exists: {file}")
|
||||
continue
|
||||
|
||||
url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
|
||||
response = requests.get(url, stream=True)
|
||||
with open(dest_path, 'wb') as f:
|
||||
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
|
||||
for chunk in response.iter_content(chunk_size=1024):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
t.update(len(chunk))
|
||||
|
||||
files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
|
||||
for file in files2:
|
||||
dest_path = os.path.join(modelsDir, file)
|
||||
|
||||
if os.path.exists(dest_path):
|
||||
print(f"Skip file download, it already exists: {file}")
|
||||
continue
|
||||
|
||||
url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
|
||||
response = requests.get(url, stream=True)
|
||||
with open(dest_path, 'wb') as f:
|
||||
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
|
||||
for chunk in response.iter_content(chunk_size=1024):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
t.update(len(chunk))
|
43
flake.lock
generated
Normal file
43
flake.lock
generated
Normal file
|
@ -0,0 +1,43 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"locked": {
|
||||
"lastModified": 1676283394,
|
||||
"narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1678470307,
|
||||
"narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
48
flake.nix
Normal file
48
flake.nix
Normal file
|
@ -0,0 +1,48 @@
|
|||
{
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
};
|
||||
outputs = { self, nixpkgs, flake-utils }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
pkgs = import nixpkgs {
|
||||
inherit system;
|
||||
};
|
||||
llama-python = pkgs.python310.withPackages (ps: with ps; [
|
||||
torch
|
||||
numpy
|
||||
sentencepiece
|
||||
]);
|
||||
in
|
||||
{
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
nativeBuildInputs = with pkgs; [ cmake ];
|
||||
buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
|
||||
darwin.apple_sdk.frameworks.Accelerate
|
||||
];
|
||||
cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
];
|
||||
installPhase = ''
|
||||
mkdir -p $out/bin
|
||||
mv llama $out/bin/llama
|
||||
mv quantize $out/bin/quantize
|
||||
echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
|
||||
cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
|
||||
chmod +x $out/bin/convert-pth-to-ggml
|
||||
'';
|
||||
};
|
||||
devShells.default = pkgs.mkShell {
|
||||
packages = with pkgs; [
|
||||
cmake
|
||||
llama-python
|
||||
] ++ lib.optionals stdenv.isDarwin [
|
||||
darwin.apple_sdk.frameworks.Accelerate
|
||||
];
|
||||
};
|
||||
}
|
||||
);
|
||||
}
|
153
ggml.c
153
ggml.c
|
@ -607,10 +607,11 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
|
|||
assert(k % QK == 0);
|
||||
|
||||
const int nb = k / QK;
|
||||
const size_t bs = 2*sizeof(float) + QK/2;
|
||||
|
||||
float * restrict pm = (float *) (y);
|
||||
float * restrict pd = (float *) (pm + nb);
|
||||
uint8_t * restrict pb = (uint8_t *) (pd + nb);
|
||||
uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
|
||||
uint8_t * restrict pm = ((uint8_t *)y + 0*bs + sizeof(float));
|
||||
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + 2*sizeof(float));
|
||||
|
||||
uint8_t pp[QK/2];
|
||||
|
||||
|
@ -627,8 +628,10 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
|
|||
const float d = (max - min) / ((1 << 4) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
pm[i] = min;
|
||||
pd[i] = d;
|
||||
*(float *)pm = min;
|
||||
*(float *)pd = d;
|
||||
pm += bs;
|
||||
pd += bs;
|
||||
|
||||
for (int l = 0; l < QK; l += 2) {
|
||||
const float v0 = (x[i*QK + l + 0] - min)*id;
|
||||
|
@ -643,7 +646,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
|
|||
pp[l/2] = vi0 | (vi1 << 4);
|
||||
}
|
||||
|
||||
memcpy(pb + i*QK/2, pp, sizeof(pp));
|
||||
memcpy(pb, pp, sizeof(pp));
|
||||
pb += bs;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -687,16 +691,17 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
|
|||
assert(k % QK == 0);
|
||||
|
||||
const int nb = k / QK;
|
||||
const size_t bs = 2*sizeof(float) + QK/2;
|
||||
|
||||
const float * restrict pm = (const float *) (x);
|
||||
const float * restrict pd = (const float *) (pm + nb);
|
||||
const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
|
||||
const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
|
||||
const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
|
||||
const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float m = pm[i];
|
||||
const float d = pd[i];
|
||||
const float d = *(const float *) (pd + i*bs);
|
||||
const float m = *(const float *) (pm + i*bs);
|
||||
|
||||
const uint8_t * restrict pp = pb + i*QK/2;
|
||||
const uint8_t * restrict pp = pb + i*bs;
|
||||
|
||||
for (int l = 0; l < QK; l += 2) {
|
||||
const uint8_t vi = pp[l/2];
|
||||
|
@ -1584,28 +1589,109 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
|
|||
inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
|
||||
const int nb = n / QK;
|
||||
|
||||
const float * restrict pm0 = (const float *) x;
|
||||
const float * restrict pm1 = (const float *) y;
|
||||
const size_t bs = 2*sizeof(float) + QK/2;
|
||||
|
||||
const float * restrict pd0 = (const float *) (pm0 + nb);
|
||||
const float * restrict pd1 = (const float *) (pm1 + nb);
|
||||
const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs);
|
||||
const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs);
|
||||
|
||||
const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
|
||||
const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
|
||||
const uint8_t * restrict pm0 = ((const uint8_t *)x + 0*bs + sizeof(float));
|
||||
const uint8_t * restrict pm1 = ((const uint8_t *)y + 0*bs + sizeof(float));
|
||||
|
||||
const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
|
||||
const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + 2*sizeof(float));
|
||||
|
||||
float sumf = 0.0;
|
||||
|
||||
#if 1
|
||||
#if defined(__AVX2__)
|
||||
#if QK == 32
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
// Accumulator for constant offsets
|
||||
float acc_offset = 0.0f;
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float * m0 = (const float *) (pm0 + i*bs);
|
||||
const float * m1 = (const float *) (pm1 + i*bs);
|
||||
|
||||
const float * d0 = (const float *) (pd0 + i*bs);
|
||||
const float * d1 = (const float *) (pd1 + i*bs);
|
||||
|
||||
const uint8_t * restrict p0 = pb0 + i*bs;
|
||||
const uint8_t * restrict p1 = pb1 + i*bs;
|
||||
|
||||
const __m256 d0v = _mm256_broadcast_ss( d0 );
|
||||
const __m256 d1v = _mm256_broadcast_ss( d1 );
|
||||
const __m256 m0v = _mm256_broadcast_ss( m0 );
|
||||
const __m256 m1v = _mm256_broadcast_ss( m1 );
|
||||
|
||||
|
||||
// Compute combined scale for the block
|
||||
const __m256 scale_01 = _mm256_mul_ps( d0v, d1v );
|
||||
|
||||
// Compute cross scales for the block
|
||||
const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
|
||||
const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
|
||||
const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
|
||||
|
||||
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
|
||||
__m256i bx = bytesFromNibbles( p0 );
|
||||
__m256i by = bytesFromNibbles( p1 );
|
||||
|
||||
// Now we have a vector with bytes in [ 0 .. 15 ] interval.
|
||||
|
||||
// Sign-extend first 16 signed bytes into int16_t
|
||||
__m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
|
||||
__m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
|
||||
// Compute products of int16_t integers, add pairwise
|
||||
__m256i i32 = _mm256_madd_epi16( x16, y16 );
|
||||
|
||||
// Sign-extend last 16 signed bytes into int16_t vectors
|
||||
__m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
|
||||
__m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
|
||||
// Accumulate products of int16_t integers
|
||||
i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) );
|
||||
|
||||
// compute sums of unsigned bytes in bx, by in blocks of 8.
|
||||
// This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000,
|
||||
// which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400.
|
||||
// so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ]
|
||||
__m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() );
|
||||
__m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() );
|
||||
__m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) );
|
||||
__m256 sums = _mm256_cvtepi32_ps( sumsi );
|
||||
|
||||
// Convert int32_t to float
|
||||
__m256 p = _mm256_cvtepi32_ps( i32 );
|
||||
// Apply the scale, and accumulate
|
||||
// acc += d0*d1*x*y + d0*m1*x + d1*m0*y
|
||||
acc = _mm256_fmadd_ps( scale_01, p, acc );
|
||||
acc = _mm256_fmadd_ps( cross_scales, sums, acc );
|
||||
// acc_offset += m0*m1 (for each entry in the block)
|
||||
acc_offset += (*m0)*(*m1);
|
||||
}
|
||||
|
||||
// Return horizontal sum of the acc vector
|
||||
__m128 res = _mm256_extractf128_ps( acc, 1 );
|
||||
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
|
||||
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
|
||||
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
|
||||
|
||||
sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
|
||||
#else
|
||||
#error "not implemented for QK"
|
||||
#endif
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float m0 = pm0[i];
|
||||
const float m1 = pm1[i];
|
||||
const float m0 = *(const float *) (pm0 + i*bs);
|
||||
const float m1 = *(const float *) (pm1 + i*bs);
|
||||
|
||||
const float d0 = pd0[i];
|
||||
const float d1 = pd1[i];
|
||||
const float d0 = *(const float *) (pd0 + i*bs);
|
||||
const float d1 = *(const float *) (pd1 + i*bs);
|
||||
|
||||
const uint8_t * restrict p0 = pb0 + i*QK/2;
|
||||
const uint8_t * restrict p1 = pb1 + i*QK/2;
|
||||
const uint8_t * restrict p0 = pb0 + i*bs;
|
||||
const uint8_t * restrict p1 = pb1 + i*bs;
|
||||
|
||||
for (int j = 0; j < QK/2; j++) {
|
||||
const uint8_t v0 = p0[j];
|
||||
|
@ -1839,16 +1925,17 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res
|
|||
assert(n % QK == 0);
|
||||
|
||||
const int nb = n / QK;
|
||||
const size_t bs = 2*sizeof(float) + QK/2;
|
||||
|
||||
const float * restrict pm = (const float *) (x);
|
||||
const float * restrict pd = (const float *) (pm + nb);
|
||||
const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
|
||||
const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
|
||||
const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
|
||||
const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float m = pm[i];
|
||||
const float d = pd[i];
|
||||
const float d = *(const float *) (pd + i*bs);
|
||||
const float m = *(const float *) (pm + i*bs);
|
||||
|
||||
const uint8_t * restrict pp = pb + i*QK/2;
|
||||
const uint8_t * restrict pp = pb + i*bs;
|
||||
|
||||
for (int l = 0; l < QK; l += 2) {
|
||||
const uint8_t vi = pp[l/2];
|
||||
|
@ -9231,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
}
|
||||
|
||||
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
|
||||
if (cgraph->n_threads <= 0) {
|
||||
cgraph->n_threads = 8;
|
||||
}
|
||||
|
||||
const int n_threads = cgraph->n_threads;
|
||||
|
||||
struct ggml_compute_state_shared state_shared = {
|
||||
|
|
2
main.cpp
2
main.cpp
|
@ -846,6 +846,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
std::vector<float> logits;
|
||||
|
||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
// tokenize the prompt
|
||||
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
|
||||
|
||||
|
|
102
utils.cpp
102
utils.cpp
|
@ -16,6 +16,18 @@
|
|||
#endif
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
// determine sensible default number of threads.
|
||||
// std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
|
||||
#ifdef __linux__
|
||||
std::ifstream cpuinfo("/proc/cpuinfo");
|
||||
params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
|
||||
std::istream_iterator<std::string>(),
|
||||
std::string("processor"));
|
||||
#endif
|
||||
if (params.n_threads == 0) {
|
||||
params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
|
||||
|
@ -275,40 +287,56 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
|||
return tokens;
|
||||
}
|
||||
|
||||
// TODO: Calculate this constant from the vocabulary
|
||||
#define MAX_TOKEN_LEN 18
|
||||
// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
|
||||
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
|
||||
//auto res = gpt_tokenize(vocab, text);
|
||||
|
||||
//if (bos) {
|
||||
// res.insert(res.begin(), 1); // TODO: replace with vocab.bos
|
||||
//}
|
||||
|
||||
std::vector<gpt_vocab::id> res;
|
||||
std::vector<int> score;
|
||||
std::vector<gpt_vocab::id> prev;
|
||||
int len = text.length();
|
||||
|
||||
score.resize(len + 1);
|
||||
prev.resize(len + 1);
|
||||
|
||||
// Forward pass
|
||||
for (int i = 0; i < len; i++) {
|
||||
int max_len = std::min(len - i, MAX_TOKEN_LEN);
|
||||
for (int sub_len = 1; sub_len <= len - i; sub_len++) {
|
||||
auto sub = text.substr(i, sub_len);
|
||||
auto token = vocab.token_to_id.find(sub);
|
||||
if (token != vocab.token_to_id.end()) {
|
||||
int token_score = sub.length() * sub.length();
|
||||
int local_score = score[i] + token_score;
|
||||
int next = i + sub_len;
|
||||
if (score[next] < local_score) {
|
||||
score[next] = local_score;
|
||||
prev[next] = (*token).second;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Backward pass
|
||||
int i = len;
|
||||
while (i > 0) {
|
||||
gpt_vocab::id token_id = prev[i];
|
||||
if (token_id == 0) {
|
||||
// TODO: Return error or something more meaningful
|
||||
printf("failed to tokenize string!\n");
|
||||
break;
|
||||
}
|
||||
res.push_back(token_id);
|
||||
auto token = (*vocab.id_to_token.find(token_id)).second;
|
||||
i -= token.length();
|
||||
}
|
||||
|
||||
if (bos) {
|
||||
res.push_back(1); // TODO: replace with vocab.bos
|
||||
}
|
||||
|
||||
//find the longest token that matches the text
|
||||
int pos = 0;
|
||||
while (true) {
|
||||
int l = 0;
|
||||
int t = 0;
|
||||
for (const auto & kv : vocab.id_to_token) {
|
||||
if (kv.second.size() < l) continue;
|
||||
if (kv.second.size() > text.size() - pos) continue;
|
||||
if (text.substr(pos, kv.second.size()) == kv.second) {
|
||||
l = kv.second.size();
|
||||
t = kv.first;
|
||||
}
|
||||
}
|
||||
|
||||
if (l == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
res.push_back(t);
|
||||
pos += l;
|
||||
}
|
||||
// Pieces are in reverse order so correct that
|
||||
std::reverse(res.begin(), res.end());
|
||||
|
||||
return res;
|
||||
}
|
||||
|
@ -489,7 +517,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
|
|||
|
||||
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
|
||||
const int nb = k / qk;
|
||||
const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
|
||||
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
|
||||
const size_t row_size = nb*bs;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
|
@ -498,10 +527,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
|
|||
|
||||
char * pdst = (char *) dst;
|
||||
|
||||
for (int j = 0; j < n; j += k) {
|
||||
float * pm = (float *) (pdst + (j/k)*row_size);
|
||||
float * pd = (float *) (pm + nb);
|
||||
uint8_t * pb = (uint8_t *) (pd + nb);
|
||||
for (int j = 0; j < n; j += k) {
|
||||
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
|
||||
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
|
||||
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
|
||||
|
||||
//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
|
||||
|
||||
|
@ -519,8 +548,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
|
|||
const float d = (max - min) / ((1 << 4) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
pm[i] = min;
|
||||
pd[i] = d;
|
||||
*(float *) pd = d;
|
||||
*(float *) pm = min;
|
||||
pd += bs;
|
||||
pm += bs;
|
||||
|
||||
for (int l = 0; l < qk; l += 2) {
|
||||
const float v0 = (src[j + i*qk + l + 0] - min)*id;
|
||||
|
@ -538,7 +569,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
|
|||
pp[l/2] = vi0 | (vi1 << 4);
|
||||
}
|
||||
|
||||
memcpy(pb + i*qk/2, pp, pp_size);
|
||||
memcpy(pb, pp, pp_size);
|
||||
pb += bs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
2
utils.h
2
utils.h
|
@ -18,7 +18,7 @@ struct gpt_params {
|
|||
int32_t n_predict = 128; // new tokens to predict
|
||||
int32_t repeat_last_n = 64; // last n tokens to penalize
|
||||
int32_t n_ctx = 512; //context size
|
||||
|
||||
|
||||
// sampling parameters
|
||||
int32_t top_k = 40;
|
||||
float top_p = 0.95f;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue