Merge branch 'master' into optimize-convert
This commit is contained in: commit a44ccef6ac
17 changed files with 522 additions and 42 deletions
.devops/full.Dockerfile (new file, 17 lines)
@@ -0,0 +1,17 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

RUN pip install --upgrade pip setuptools wheel \
    && pip install torch torchvision torchaudio sentencepiece numpy

WORKDIR /app

COPY . .

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]
.devops/main.Dockerfile (new file, 18 lines)
@@ -0,0 +1,18 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential

WORKDIR /app

COPY . .

RUN make

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/main /main

ENTRYPOINT [ "/main" ]
.devops/tools.sh (new executable file, 46 lines)
@@ -0,0 +1,46 @@
#!/bin/bash
set -e

# Read the first argument into a variable
arg1="$1"

# Shift the arguments to remove the first one
shift

# Join the remaining arguments into a single string
arg2="$@"

if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
    python3 ./convert-pth-to-ggml.py $arg2
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
    ./quantize $arg2
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
    ./main $arg2
elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
    python3 ./download-pth.py $arg2
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
    echo "Downloading model..."
    python3 ./download-pth.py "$1" "$2"
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
            ./quantize "$i" "${i/f16/q4_0}" 2
        fi
    done
else
    echo "Unknown command: $arg1"
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "              ex: \"/models/7B/\" 1"
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
    echo "              ex: \"/models/\" 7B"
    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
fi
.dockerignore (new file, 24 lines)
@@ -0,0 +1,24 @@
*.o
*.a
.cache/
.vs/
.vscode/
.DS_Store

build/
build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/

models/*

/main
/quantize

arm_neon.h
compile_commands.json
Dockerfile
.github/workflows/build.yml (vendored, 102 changes)
@@ -1,8 +1,42 @@
 name: CI
-on: [push, pull_request]
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
+  push:
+    paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+  pull_request:
+    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
+    paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

 jobs:
-  ubuntu-latest:
+  ubuntu-latest-make:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: make_build
+        run: |
+          make
+
+  ubuntu-latest-cmake:
     runs-on: ubuntu-latest

     steps:
@@ -15,10 +49,31 @@ jobs:
           sudo apt-get install build-essential

       - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+
+  macOS-latest-make:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          brew update
+
+      - name: Build
+        id: make_build
         run: |
           make

-  macOS-latest:
+  macOS-latest-cmake:
    runs-on: macOS-latest

    steps:
@@ -31,22 +86,59 @@ jobs:

       - name: Build
         run: |
-          make
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release

-  windows-latest:
+  windows-latest-cmake:
     runs-on: windows-latest

     steps:
       - name: Clone
+        id: checkout
         uses: actions/checkout@v1

       - name: Build
+        id: cmake_build
         run: |
           mkdir build
           cd build
           cmake ..
           cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\*
+
+      - name: Create release
+        id: create_release
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: zendesk/action-create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+
+      - name: Upload release
+        id: upload_release
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_content_type: application/octet-stream

 #  ubuntu-latest-gcc:
 #    runs-on: ubuntu-latest
 #
.github/workflows/docker.yml (vendored, new file, 61 lines)
@@ -0,0 +1,61 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# GitHub recommends pinning actions to a commit SHA.
# To get a newer version, you will need to update the SHA.
# You can also reference a tag or branch, but the action may change without warning.

name: Publish Docker image

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
      matrix:
        config:
          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
        uses: docker/build-push-action@v4
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
.gitignore (vendored, 4 changes)
@@ -18,6 +18,10 @@ models/*

 /main
 /quantize
+/result

 arm_neon.h
 compile_commands.json
+
+.envrc
+.direnv/
CMakeLists.txt
@@ -4,6 +4,8 @@ project("llama.cpp")
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)

 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
@@ -126,3 +128,4 @@ target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
 target_include_directories(ggml PUBLIC .)
 target_link_libraries(quantize PRIVATE ggml)
 target_link_libraries(llama PRIVATE ggml)
+target_link_libraries(ggml PRIVATE Threads::Threads)
README.md (39 changes)
@@ -32,13 +32,14 @@ Supported platforms:
 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
+- [X] Docker

 ---

 Here is a typical run using LLaMA-7B:

 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 I llama.cpp build info:
 I UNAME_S: Darwin
 I UNAME_P: arm
@@ -149,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./quantize.sh 7B

 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```

 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -163,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o

 Here is an example few-shot interaction, invoked with the command
 ```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
 -p \
 "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
@@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

+### Docker
+
+#### Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store the big models and intermediate files (e.g. /llama/models)
+
+#### Images
+We have two Docker images available for this project:
+
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable and the tools to convert LLaMA models into ggml and quantize them to 4-bit.
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable.
+
+#### Usage
+
+The easiest way to download the models, convert them to ggml, and optimize them is with the --all-in-one command, which is included in the full Docker image.
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+```
+
+On completion, you are ready to play!
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+```
+
+or with the light image:
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+```
+
 ## Limitations

@@ -210,6 +242,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
 - Any help with managing issues and PRs is very appreciated!
+- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)

 ### Coding guidelines
convert-pth-to-ggml.py
@@ -16,7 +16,7 @@
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
+import os
 import sys
 import json
 import struct
@@ -59,7 +59,6 @@ def get_n_parts(dim):
     print("Invalid dim: " + str(dim))
     sys.exit(1)

-
 def main():
     args = parse_args()
     dir_model = args.dir_model
download-pth.py (new file, 66 lines)
@@ -0,0 +1,66 @@
import os
import sys
from tqdm import tqdm
import requests

if len(sys.argv) < 3:
    print("Usage: download-pth.py dir-model model-type\n")
    print("  model-type: Available models 7B, 13B, 30B or 65B")
    sys.exit(1)

modelsDir = sys.argv[1]
model = sys.argv[2]

num = {
    "7B": 1,
    "13B": 2,
    "30B": 4,
    "65B": 8,
}

if model not in num:
    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
    sys.exit(1)

print(f"Downloading model {model}")

files = ["checklist.chk", "params.json"]

for i in range(num[model]):
    files.append(f"consolidated.0{i}.pth")

resolved_path = os.path.abspath(os.path.join(modelsDir, model))
os.makedirs(resolved_path, exist_ok=True)

for file in files:
    dest_path = os.path.join(resolved_path, file)

    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))

files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
for file in files2:
    dest_path = os.path.join(modelsDir, file)

    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))
flake.lock (generated, new file, 43 lines)
@@ -0,0 +1,43 @@
{
  "nodes": {
    "flake-utils": {
      "locked": {
        "lastModified": 1676283394,
        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1678470307,
        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}
flake.nix (new file, 48 lines)
@@ -0,0 +1,48 @@
{
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
    flake-utils.url = "github:numtide/flake-utils";
  };
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs {
          inherit system;
        };
        llama-python = pkgs.python310.withPackages (ps: with ps; [
          torch
          numpy
          sentencepiece
        ]);
      in
      {
        packages.default = pkgs.stdenv.mkDerivation {
          name = "llama.cpp";
          src = ./.;
          nativeBuildInputs = with pkgs; [ cmake ];
          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
            darwin.apple_sdk.frameworks.Accelerate
          ];
          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
          ];
          installPhase = ''
            mkdir -p $out/bin
            mv llama $out/bin/llama
            mv quantize $out/bin/quantize
            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
            chmod +x $out/bin/convert-pth-to-ggml
          '';
        };
        devShells.default = pkgs.mkShell {
          packages = with pkgs; [
            cmake
            llama-python
          ] ++ lib.optionals stdenv.isDarwin [
            darwin.apple_sdk.frameworks.Accelerate
          ];
        };
      }
    );
}
ggml.c (4 changes)
@@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }

 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    if (cgraph->n_threads <= 0) {
-        cgraph->n_threads = 8;
-    }
-
     const int n_threads = cgraph->n_threads;

     struct ggml_compute_state_shared state_shared = {
main.cpp (2 changes)
@@ -845,6 +845,8 @@ int main(int argc, char ** argv) {

     std::vector<float> logits;

+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
     std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
utils.cpp (82 changes)
@@ -16,6 +16,18 @@
 #endif

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
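For reference, the hunk above picks a default thread count by counting "processor" entries in /proc/cpuinfo, and only falls back to std::thread::hardware_concurrency() when that yields nothing. A minimal Python sketch of the same idea, for illustration only (the function name is hypothetical and not part of this commit):

```python
# Illustrative sketch (not part of this commit): mirror the default-thread
# heuristic from gpt_params_parse. Count "processor" tokens in /proc/cpuinfo,
# falling back to the generic CPU count when the file is unavailable or empty.
import os

def default_n_threads() -> int:
    count = 0
    try:
        with open("/proc/cpuinfo") as cpuinfo:
            count = sum(tok == "processor" for tok in cpuinfo.read().split())
    except OSError:
        pass
    return count if count > 0 else max(1, os.cpu_count() or 1)
```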
@@ -275,40 +287,56 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }

+// TODO: Calculate this constant from the vocabulary
+#define MAX_TOKEN_LEN 18
+// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
-
     std::vector<gpt_vocab::id> res;
+    std::vector<int> score;
+    std::vector<gpt_vocab::id> prev;
+    int len = text.length();
+
+    score.resize(len + 1);
+    prev.resize(len + 1);
+
+    // Forward pass
+    for (int i = 0; i < len; i++) {
+        int max_len = std::min(len - i, MAX_TOKEN_LEN);
+        for (int sub_len = 1; sub_len <= len - i; sub_len++) {
+            auto sub = text.substr(i, sub_len);
+            auto token = vocab.token_to_id.find(sub);
+            if (token != vocab.token_to_id.end()) {
+                int token_score = sub.length() * sub.length();
+                int local_score = score[i] + token_score;
+                int next = i + sub_len;
+                if (score[next] < local_score) {
+                    score[next] = local_score;
+                    prev[next] = (*token).second;
+                }
+            }
+        }
+    }
+
+    // Backward pass
+    int i = len;
+    while (i > 0) {
+        gpt_vocab::id token_id = prev[i];
+        if (token_id == 0) {
+            // TODO: Return error or something more meaningful
+            printf("failed to tokenize string!\n");
+            break;
+        }
+        res.push_back(token_id);
+        auto token = (*vocab.id_to_token.find(token_id)).second;
+        i -= token.length();
+    }

     if (bos) {
         res.push_back(1); // TODO: replace with vocab.bos
     }

-    //find the longest token that matches the text
-    int pos = 0;
-    while (true) {
-        int l = 0;
-        int t = 0;
-        for (const auto & kv : vocab.id_to_token) {
-            if (kv.second.size() < l) continue;
-            if (kv.second.size() > text.size() - pos) continue;
-            if (text.substr(pos, kv.second.size()) == kv.second) {
-                l = kv.second.size();
-                t = kv.first;
-            }
-        }
-
-        if (l == 0) {
-            break;
-        }
-
-        res.push_back(t);
-        pos += l;
-    }
+    // Pieces are in reverse order so correct that
+    std::reverse(res.begin(), res.end());

     return res;
 }
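The rewritten llama_tokenize above replaces the old greedy longest-match loop with a dynamic-programming segmentation: the forward pass records, for every position in the text, the best reachable score (sum of squared piece lengths) and the token that reaches it, and the backward pass walks prev[] from the end to recover the pieces, which come out in reverse order. A rough Python sketch of the same algorithm, for illustration only (it assumes vocab is a plain dict from token string to id, which is not the commit's actual gpt_vocab type):

```python
# Illustrative sketch (not part of this commit) of the forward/backward
# segmentation used in llama_tokenize: pick the split whose pieces maximize
# the sum of squared piece lengths, then backtrack to recover token ids.
MAX_TOKEN_LEN = 18  # same constant the C++ code hard-codes

def tokenize(vocab: dict[str, int], text: str) -> list[int]:
    n = len(text)
    score = [0] * (n + 1)
    prev = [None] * (n + 1)  # (token_id, piece_length) that best reaches each position

    # Forward pass: try every vocabulary piece starting at position i.
    for i in range(n):
        for sub_len in range(1, min(n - i, MAX_TOKEN_LEN) + 1):
            piece = text[i:i + sub_len]
            token_id = vocab.get(piece)
            if token_id is not None and score[i] + sub_len * sub_len > score[i + sub_len]:
                score[i + sub_len] = score[i] + sub_len * sub_len
                prev[i + sub_len] = (token_id, sub_len)

    # Backward pass: walk back from the end, collecting pieces in reverse order.
    res, i = [], n
    while i > 0:
        if prev[i] is None:
            raise ValueError("failed to tokenize string")
        token_id, sub_len = prev[i]
        res.append(token_id)
        i -= sub_len
    return list(reversed(res))
```

Scoring by squared piece length favors segmentations with fewer, longer tokens, whereas the removed greedy loop always committed to the locally longest match.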
utils.h (2 changes)
@@ -18,7 +18,7 @@ struct gpt_params {
     int32_t n_predict = 128; // new tokens to predict
     int32_t repeat_last_n = 64; // last n tokens to penalize
     int32_t n_ctx = 512; //context size

     // sampling parameters
     int32_t top_k = 40;
     float top_p = 0.95f;