Merge remote-tracking branch 'upstream/master' into suajcarrot-changes
This commit is contained in:
commit
5d864c1d69
19 changed files with 864 additions and 251 deletions
17
.devops/full.Dockerfile
Normal file
17
.devops/full.Dockerfile
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
|
FROM ubuntu:$UBUNTU_VERSION as build
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y build-essential python3 python3-pip
|
||||||
|
|
||||||
|
RUN pip install --upgrade pip setuptools wheel \
|
||||||
|
&& pip install torch torchvision torchaudio sentencepiece numpy
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
RUN make
|
||||||
|
|
||||||
|
ENTRYPOINT ["/app/.devops/tools.sh"]
|
18
.devops/main.Dockerfile
Normal file
18
.devops/main.Dockerfile
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
|
FROM ubuntu:$UBUNTU_VERSION as build
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y build-essential
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
RUN make
|
||||||
|
|
||||||
|
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||||
|
|
||||||
|
COPY --from=build /app/main /main
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/main" ]
|
46
.devops/tools.sh
Executable file
46
.devops/tools.sh
Executable file
|
@ -0,0 +1,46 @@
|
||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# Read the first argument into a variable
|
||||||
|
arg1="$1"
|
||||||
|
|
||||||
|
# Shift the arguments to remove the first one
|
||||||
|
shift
|
||||||
|
|
||||||
|
# Join the remaining arguments into a single string
|
||||||
|
arg2="$@"
|
||||||
|
|
||||||
|
if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
|
||||||
|
python3 ./convert-pth-to-ggml.py $arg2
|
||||||
|
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
|
||||||
|
./quantize $arg2
|
||||||
|
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
|
||||||
|
./main $arg2
|
||||||
|
elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
|
||||||
|
python3 ./download-pth.py $arg2
|
||||||
|
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
|
||||||
|
echo "Downloading model..."
|
||||||
|
python3 ./download-pth.py "$1" "$2"
|
||||||
|
echo "Converting PTH to GGML..."
|
||||||
|
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||||
|
if [ -f "${i/f16/q4_0}" ]; then
|
||||||
|
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
|
||||||
|
else
|
||||||
|
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
|
||||||
|
./quantize "$i" "${i/f16/q4_0}" 2
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
else
|
||||||
|
echo "Unknown command: $arg1"
|
||||||
|
echo "Available commands: "
|
||||||
|
echo " --run (-r): Run a model previously converted into ggml"
|
||||||
|
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
|
||||||
|
echo " --convert (-c): Convert a llama model into ggml"
|
||||||
|
echo " ex: \"/models/7B/\" 1"
|
||||||
|
echo " --quantize (-q): Optimize with quantization process ggml"
|
||||||
|
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
|
||||||
|
echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
|
||||||
|
echo " ex: \"/models/\" 7B"
|
||||||
|
echo " --all-in-one (-a): Execute --download, --convert & --quantize"
|
||||||
|
echo " ex: \"/models/\" 7B"
|
||||||
|
fi
|
24
.dockerignore
Normal file
24
.dockerignore
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
*.o
|
||||||
|
*.a
|
||||||
|
.cache/
|
||||||
|
.vs/
|
||||||
|
.vscode/
|
||||||
|
.DS_Store
|
||||||
|
|
||||||
|
build/
|
||||||
|
build-em/
|
||||||
|
build-debug/
|
||||||
|
build-release/
|
||||||
|
build-static/
|
||||||
|
build-no-accel/
|
||||||
|
build-sanitize-addr/
|
||||||
|
build-sanitize-thread/
|
||||||
|
|
||||||
|
models/*
|
||||||
|
|
||||||
|
/main
|
||||||
|
/quantize
|
||||||
|
|
||||||
|
arm_neon.h
|
||||||
|
compile_commands.json
|
||||||
|
Dockerfile
|
102
.github/workflows/build.yml
vendored
102
.github/workflows/build.yml
vendored
|
@ -1,8 +1,42 @@
|
||||||
name: CI
|
name: CI
|
||||||
on: [push, pull_request]
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
inputs:
|
||||||
|
create_release:
|
||||||
|
description: 'Create new release'
|
||||||
|
required: true
|
||||||
|
type: boolean
|
||||||
|
push:
|
||||||
|
paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
|
||||||
|
paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
|
||||||
|
|
||||||
|
env:
|
||||||
|
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
ubuntu-latest:
|
ubuntu-latest-make:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v1
|
||||||
|
|
||||||
|
- name: Dependencies
|
||||||
|
id: depends
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install build-essential
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: make_build
|
||||||
|
run: |
|
||||||
|
make
|
||||||
|
|
||||||
|
ubuntu-latest-cmake:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
@ -15,10 +49,31 @@ jobs:
|
||||||
sudo apt-get install build-essential
|
sudo apt-get install build-essential
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
|
run: |
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake ..
|
||||||
|
cmake --build . --config Release
|
||||||
|
|
||||||
|
macOS-latest-make:
|
||||||
|
runs-on: macos-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v1
|
||||||
|
|
||||||
|
- name: Dependencies
|
||||||
|
id: depends
|
||||||
|
run: |
|
||||||
|
brew update
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: make_build
|
||||||
run: |
|
run: |
|
||||||
make
|
make
|
||||||
|
|
||||||
macOS-latest:
|
macOS-latest-cmake:
|
||||||
runs-on: macOS-latest
|
runs-on: macOS-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
@ -31,22 +86,59 @@ jobs:
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
run: |
|
run: |
|
||||||
make
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake ..
|
||||||
|
cmake --build . --config Release
|
||||||
|
|
||||||
windows-latest:
|
windows-latest-cmake:
|
||||||
runs-on: windows-latest
|
runs-on: windows-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
uses: actions/checkout@v1
|
uses: actions/checkout@v1
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake ..
|
cmake ..
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
|
|
||||||
|
- name: Get commit hash
|
||||||
|
id: commit
|
||||||
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
uses: pr-mpt/actions-commit-hash@v2
|
||||||
|
|
||||||
|
- name: Pack artifacts
|
||||||
|
id: pack_artifacts
|
||||||
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
run: |
|
||||||
|
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\*
|
||||||
|
|
||||||
|
- name: Create release
|
||||||
|
id: create_release
|
||||||
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
uses: zendesk/action-create-release@v1
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
with:
|
||||||
|
tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
|
||||||
|
|
||||||
|
- name: Upload release
|
||||||
|
id: upload_release
|
||||||
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
uses: actions/upload-release-asset@v1
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
with:
|
||||||
|
upload_url: ${{ steps.create_release.outputs.upload_url }}
|
||||||
|
asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
|
||||||
|
asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
|
||||||
|
asset_content_type: application/octet-stream
|
||||||
|
|
||||||
# ubuntu-latest-gcc:
|
# ubuntu-latest-gcc:
|
||||||
# runs-on: ubuntu-latest
|
# runs-on: ubuntu-latest
|
||||||
#
|
#
|
||||||
|
|
61
.github/workflows/docker.yml
vendored
Normal file
61
.github/workflows/docker.yml
vendored
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
# This workflow uses actions that are not certified by GitHub.
|
||||||
|
# They are provided by a third-party and are governed by
|
||||||
|
# separate terms of service, privacy policy, and support
|
||||||
|
# documentation.
|
||||||
|
|
||||||
|
# GitHub recommends pinning actions to a commit SHA.
|
||||||
|
# To get a newer version, you will need to update the SHA.
|
||||||
|
# You can also reference a tag or branch, but the action may change without warning.
|
||||||
|
|
||||||
|
name: Publish Docker image
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
push_to_registry:
|
||||||
|
name: Push Docker image to Docker Hub
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
env:
|
||||||
|
COMMIT_SHA: ${{ github.sha }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
config:
|
||||||
|
- { tag: "light", dockerfile: ".devops/main.Dockerfile" }
|
||||||
|
- { tag: "full", dockerfile: ".devops/full.Dockerfile" }
|
||||||
|
steps:
|
||||||
|
- name: Check out the repo
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up QEMU
|
||||||
|
uses: docker/setup-qemu-action@v2
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v2
|
||||||
|
|
||||||
|
- name: Log in to Docker Hub
|
||||||
|
uses: docker/login-action@v2
|
||||||
|
with:
|
||||||
|
registry: ghcr.io
|
||||||
|
username: ${{ github.actor }}
|
||||||
|
password: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
|
- name: Build and push Docker image (versioned)
|
||||||
|
if: github.event_name == 'push'
|
||||||
|
uses: docker/build-push-action@v4
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
push: true
|
||||||
|
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||||
|
file: ${{ matrix.config.dockerfile }}
|
||||||
|
|
||||||
|
- name: Build and push Docker image (tagged)
|
||||||
|
uses: docker/build-push-action@v4
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
push: ${{ github.event_name == 'push' }}
|
||||||
|
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
|
||||||
|
file: ${{ matrix.config.dockerfile }}
|
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -18,6 +18,10 @@ models/*
|
||||||
|
|
||||||
/main
|
/main
|
||||||
/quantize
|
/quantize
|
||||||
|
/result
|
||||||
|
|
||||||
arm_neon.h
|
arm_neon.h
|
||||||
compile_commands.json
|
compile_commands.json
|
||||||
|
|
||||||
|
.envrc
|
||||||
|
.direnv/
|
||||||
|
|
|
@ -4,6 +4,8 @@ project("llama.cpp")
|
||||||
set(CMAKE_CXX_STANDARD 20)
|
set(CMAKE_CXX_STANDARD 20)
|
||||||
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
||||||
set(CMAKE_C_STANDARD 11)
|
set(CMAKE_C_STANDARD 11)
|
||||||
|
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||||
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||||
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
||||||
|
@ -126,3 +128,4 @@ target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
|
||||||
target_include_directories(ggml PUBLIC .)
|
target_include_directories(ggml PUBLIC .)
|
||||||
target_link_libraries(quantize PRIVATE ggml)
|
target_link_libraries(quantize PRIVATE ggml)
|
||||||
target_link_libraries(llama PRIVATE ggml)
|
target_link_libraries(llama PRIVATE ggml)
|
||||||
|
target_link_libraries(ggml PRIVATE Threads::Threads)
|
||||||
|
|
101
README.md
101
README.md
|
@ -7,7 +7,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||||
|
|
||||||
**Hot topics:**
|
**Hot topics:**
|
||||||
|
|
||||||
- RMSNorm implementation / fixes: https://github.com/ggerganov/llama.cpp/issues/173
|
- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
|
||||||
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
|
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
|
||||||
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
|
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
|
||||||
|
|
||||||
|
@ -32,13 +32,14 @@ Supported platforms:
|
||||||
- [X] Mac OS
|
- [X] Mac OS
|
||||||
- [X] Linux
|
- [X] Linux
|
||||||
- [X] Windows (via CMake)
|
- [X] Windows (via CMake)
|
||||||
|
- [X] Docker
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
Here is a typical run using LLaMA-7B:
|
Here is a typical run using LLaMA-7B:
|
||||||
|
|
||||||
```java
|
```java
|
||||||
make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
|
make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||||
I llama.cpp build info:
|
I llama.cpp build info:
|
||||||
I UNAME_S: Darwin
|
I UNAME_S: Darwin
|
||||||
I UNAME_P: arm
|
I UNAME_P: arm
|
||||||
|
@ -149,12 +150,24 @@ python3 convert-pth-to-ggml.py models/7B/ 1
|
||||||
python3 quantize.py 7B
|
python3 quantize.py 7B
|
||||||
|
|
||||||
# run the inference
|
# run the inference
|
||||||
./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
|
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
|
||||||
|
|
||||||
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||||
|
|
||||||
TODO: add model disk/mem requirements
|
### Memory/Disk Requirements
|
||||||
|
|
||||||
|
As the models are currently fully loaded into memory, you will need adequate disk space to save them
|
||||||
|
and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
|
||||||
|
|
||||||
|
| model | original size | quantized size (4-bit) |
|
||||||
|
|-------|---------------|------------------------|
|
||||||
|
| 7B | 13 GB | 3.9 GB |
|
||||||
|
| 13B | 24 GB | 7.8 GB |
|
||||||
|
| 30B | 60 GB | 19.5 GB |
|
||||||
|
| 65B | 120 GB | 38.5 GB |
|
||||||
|
|
||||||
### Interactive mode
|
### Interactive mode
|
||||||
|
|
||||||
|
@ -163,21 +176,51 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o
|
||||||
|
|
||||||
Here is an example few-shot interaction, invoked with the command
|
Here is an example few-shot interaction, invoked with the command
|
||||||
```
|
```
|
||||||
./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
|
./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
||||||
-p \
|
|
||||||
"Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
|
|
||||||
|
|
||||||
User: Hello, Bob.
|
|
||||||
Bob: Hello. How may I help you today?
|
|
||||||
User: Please tell me the largest city in Europe.
|
|
||||||
Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
|
|
||||||
User:"
|
|
||||||
|
|
||||||
```
|
```
|
||||||
Note the use of `--color` to distinguish between user input and generated text.
|
Note the use of `--color` to distinguish between user input and generated text.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
### Instruction mode with Alpaca
|
||||||
|
|
||||||
|
First, download the `ggml` Alpaca model into the `./models` folder:
|
||||||
|
|
||||||
|
```
|
||||||
|
# use one of these
|
||||||
|
# NOTE: these are copied from the alpaca.cpp repo - not sure how long these will work
|
||||||
|
# TODO: add a script to simplify the download
|
||||||
|
curl -o ggml-alpaca-7b-q4.bin -C - https://gateway.estuary.tech/gw/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
|
||||||
|
curl -o ggml-alpaca-7b-q4.bin -C - https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
|
||||||
|
curl -o ggml-alpaca-7b-q4.bin -C - https://cloudflare-ipfs.com/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
|
||||||
|
```
|
||||||
|
|
||||||
|
Now run the `main` tool like this:
|
||||||
|
|
||||||
|
```
|
||||||
|
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins
|
||||||
|
```
|
||||||
|
|
||||||
|
Sample run:
|
||||||
|
|
||||||
|
```
|
||||||
|
== Running in interactive mode. ==
|
||||||
|
- Press Ctrl+C to interject at any time.
|
||||||
|
- Press Return to return control to LLaMa.
|
||||||
|
- If you want to submit another line, end your input in '\'.
|
||||||
|
|
||||||
|
Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||||
|
|
||||||
|
> How many letters are there in the English alphabet?
|
||||||
|
There 26 letters in the English Alphabet
|
||||||
|
> What is the most common way of transportation in Amsterdam?
|
||||||
|
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
|
||||||
|
> List 5 words that start with "ca".
|
||||||
|
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
|
||||||
|
>
|
||||||
|
```
|
||||||
|
|
||||||
### Android
|
### Android
|
||||||
|
|
||||||
You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
|
You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
|
||||||
|
@ -194,6 +237,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
|
||||||
|
|
||||||
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
||||||
|
|
||||||
|
### Docker
|
||||||
|
|
||||||
|
#### Prerequisites
|
||||||
|
* Docker must be installed and running on your system.
|
||||||
|
* Create a folder to store big models & intermediate files (in ex. im using /llama/models)
|
||||||
|
|
||||||
|
#### Images
|
||||||
|
We have two Docker images available for this project:
|
||||||
|
|
||||||
|
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||||
|
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
|
||||||
|
|
||||||
|
#### Usage
|
||||||
|
|
||||||
|
The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
|
||||||
|
```
|
||||||
|
|
||||||
|
On complete, you are ready to play!
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||||
|
```
|
||||||
|
|
||||||
|
or with light image:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||||
|
```
|
||||||
|
|
||||||
## Limitations
|
## Limitations
|
||||||
|
|
||||||
|
@ -210,6 +284,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
|
||||||
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
|
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
|
||||||
- Collaborators will be invited based on contributions
|
- Collaborators will be invited based on contributions
|
||||||
- Any help with managing issues and PRs is very appreciated!
|
- Any help with managing issues and PRs is very appreciated!
|
||||||
|
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
|
||||||
|
|
||||||
### Coding guidelines
|
### Coding guidelines
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,7 @@
|
||||||
# At the start of the ggml file we write the model parameters
|
# At the start of the ggml file we write the model parameters
|
||||||
# and vocabulary.
|
# and vocabulary.
|
||||||
#
|
#
|
||||||
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
import struct
|
import struct
|
||||||
|
@ -24,132 +24,81 @@ import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from sentencepiece import SentencePieceProcessor
|
from sentencepiece import SentencePieceProcessor
|
||||||
|
|
||||||
if len(sys.argv) < 3:
|
def parse_args():
|
||||||
print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
|
|
||||||
print(" ftype == 0 -> float32")
|
|
||||||
print(" ftype == 1 -> float16")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# output in the same directory as the model
|
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
|
||||||
dir_model = sys.argv[1]
|
parser.add_argument('dir_model', help='directory containing the model checkpoint')
|
||||||
|
parser.add_argument('ftype', type=int, choices=[0, 1], default=1, help='file type (0: float32, 1: float16)')
|
||||||
fname_hparams = sys.argv[1] + "/params.json"
|
return parser.parse_args()
|
||||||
fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
|
|
||||||
|
|
||||||
def get_n_parts(dim):
|
def get_n_parts(dim):
|
||||||
if dim == 4096:
|
|
||||||
return 1
|
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
|
||||||
elif dim == 5120:
|
n_parts = mappings.get(dim)
|
||||||
return 2
|
if n_parts is None:
|
||||||
elif dim == 6656:
|
print(f"Invalid dim: {dim}")
|
||||||
return 4
|
|
||||||
elif dim == 8192:
|
|
||||||
return 8
|
|
||||||
else:
|
|
||||||
print("Invalid dim: " + str(dim))
|
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# possible data types
|
print(f"n_parts = {n_parts}\n")
|
||||||
# ftype == 0 -> float32
|
return n_parts
|
||||||
# ftype == 1 -> float16
|
|
||||||
#
|
|
||||||
# map from ftype to string
|
|
||||||
ftype_str = ["f32", "f16"]
|
|
||||||
|
|
||||||
ftype = 1
|
def load_hparams_and_tokenizer(dir_model):
|
||||||
if len(sys.argv) > 2:
|
|
||||||
ftype = int(sys.argv[2])
|
|
||||||
if ftype < 0 or ftype > 1:
|
|
||||||
print("Invalid ftype: " + str(ftype))
|
|
||||||
sys.exit(1)
|
|
||||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
|
|
||||||
|
|
||||||
with open(fname_hparams, "r") as f:
|
fname_hparams = f"{dir_model}/params.json"
|
||||||
|
fname_tokenizer = f"{dir_model}/../tokenizer.model"
|
||||||
|
|
||||||
|
with open(fname_hparams, "r") as f:
|
||||||
hparams = json.load(f)
|
hparams = json.load(f)
|
||||||
|
print(hparams)
|
||||||
|
|
||||||
tokenizer = SentencePieceProcessor(fname_tokenizer)
|
tokenizer = SentencePieceProcessor(fname_tokenizer)
|
||||||
|
hparams.update({"vocab_size": tokenizer.vocab_size()})
|
||||||
|
|
||||||
hparams.update({"vocab_size": tokenizer.vocab_size()})
|
return hparams, tokenizer
|
||||||
|
|
||||||
n_parts = get_n_parts(hparams["dim"])
|
def write_header(fout, hparams, ftype):
|
||||||
|
|
||||||
print(hparams)
|
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
||||||
print('n_parts = ', n_parts)
|
values = [
|
||||||
|
0x67676d6c, # magic: ggml in hex
|
||||||
|
*[hparams[key] for key in keys],
|
||||||
|
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
|
||||||
|
ftype
|
||||||
|
]
|
||||||
|
fout.write(struct.pack("i" * len(values), *values))
|
||||||
|
|
||||||
for p in range(n_parts):
|
def write_tokens(fout, tokenizer):
|
||||||
print('Processing part ', p)
|
|
||||||
|
|
||||||
#fname_model = sys.argv[1] + "/consolidated.00.pth"
|
|
||||||
fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
|
|
||||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
|
|
||||||
if (p > 0):
|
|
||||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
|
|
||||||
|
|
||||||
model = torch.load(fname_model, map_location="cpu")
|
|
||||||
|
|
||||||
fout = open(fname_out, "wb")
|
|
||||||
|
|
||||||
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
|
|
||||||
fout.write(struct.pack("i", hparams["vocab_size"]))
|
|
||||||
fout.write(struct.pack("i", hparams["dim"]))
|
|
||||||
fout.write(struct.pack("i", hparams["multiple_of"]))
|
|
||||||
fout.write(struct.pack("i", hparams["n_heads"]))
|
|
||||||
fout.write(struct.pack("i", hparams["n_layers"]))
|
|
||||||
fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
|
|
||||||
fout.write(struct.pack("i", ftype))
|
|
||||||
|
|
||||||
# Is this correct??
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
for i in range(tokenizer.vocab_size()):
|
||||||
if tokenizer.is_unknown(i):
|
if tokenizer.is_unknown(i):
|
||||||
# "<unk>" token (translated as ??)
|
|
||||||
text = " \u2047 ".encode("utf-8")
|
text = " \u2047 ".encode("utf-8")
|
||||||
fout.write(struct.pack("i", len(text)))
|
|
||||||
fout.write(text)
|
|
||||||
elif tokenizer.is_control(i):
|
elif tokenizer.is_control(i):
|
||||||
# "<s>"/"</s>" tokens
|
text = b""
|
||||||
fout.write(struct.pack("i", 0))
|
|
||||||
elif tokenizer.is_byte(i):
|
elif tokenizer.is_byte(i):
|
||||||
# "<U+XX>" tokens (which may be invalid UTF-8)
|
|
||||||
piece = tokenizer.id_to_piece(i)
|
piece = tokenizer.id_to_piece(i)
|
||||||
if len(piece) != 6:
|
if len(piece) != 6:
|
||||||
print("Invalid token: " + piece)
|
print(f"Invalid token: {piece}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
byte_value = int(piece[3:-1], 16)
|
byte_value = int(piece[3:-1], 16)
|
||||||
fout.write(struct.pack("i", 1))
|
text = struct.pack("B", byte_value)
|
||||||
fout.write(struct.pack("B", byte_value))
|
|
||||||
else:
|
else:
|
||||||
# normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
|
|
||||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||||
fout.write(struct.pack("i", len(text)))
|
fout.write(struct.pack("i", len(text)))
|
||||||
fout.write(text)
|
fout.write(text)
|
||||||
|
|
||||||
for k, v in model.items():
|
def process_and_write_variables(fout, model, ftype):
|
||||||
name = k
|
|
||||||
shape = v.shape
|
|
||||||
|
|
||||||
# skip layers.X.attention.inner_attention.rope.freqs
|
for name, datao in model.items():
|
||||||
if name[-5:] == "freqs":
|
|
||||||
|
if name.endswith("freqs"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
|
shape = datao.shape
|
||||||
|
|
||||||
#data = tf.train.load_variable(dir_model, name).squeeze()
|
print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
|
||||||
data = v.numpy().squeeze()
|
|
||||||
n_dims = len(data.shape);
|
|
||||||
|
|
||||||
# for efficiency - transpose some matrices
|
data = datao.numpy().squeeze()
|
||||||
# "model/h.*/attn/c_attn/w"
|
n_dims = len(shape)
|
||||||
# "model/h.*/attn/c_proj/w"
|
|
||||||
# "model/h.*/mlp/c_fc/w"
|
|
||||||
# "model/h.*/mlp/c_proj/w"
|
|
||||||
#if name[-14:] == "/attn/c_attn/w" or \
|
|
||||||
# name[-14:] == "/attn/c_proj/w" or \
|
|
||||||
# name[-11:] == "/mlp/c_fc/w" or \
|
|
||||||
# name[-13:] == "/mlp/c_proj/w":
|
|
||||||
# print(" Transposing")
|
|
||||||
# data = data.transpose()
|
|
||||||
|
|
||||||
dshape = data.shape
|
|
||||||
|
|
||||||
# default type is fp16
|
# default type is fp16
|
||||||
ftype_cur = 1
|
ftype_cur = 1
|
||||||
|
@ -160,18 +109,40 @@ for p in range(n_parts):
|
||||||
|
|
||||||
# header
|
# header
|
||||||
sname = name.encode('utf-8')
|
sname = name.encode('utf-8')
|
||||||
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
|
fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
|
||||||
for i in range(n_dims):
|
for dim in reversed(data.shape):
|
||||||
fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
|
fout.write(struct.pack("i", dim))
|
||||||
fout.write(sname);
|
fout.write(sname)
|
||||||
|
|
||||||
# data
|
# data output to file
|
||||||
data.tofile(fout)
|
data.tofile(fout)
|
||||||
|
|
||||||
# I hope this deallocates the memory ..
|
def main():
|
||||||
model = None
|
|
||||||
|
|
||||||
fout.close()
|
args = parse_args()
|
||||||
|
dir_model = args.dir_model
|
||||||
|
ftype = args.ftype
|
||||||
|
ftype_str = ["f32", "f16"]
|
||||||
|
|
||||||
print("Done. Output file: " + fname_out + ", (part ", p, ")")
|
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
|
||||||
print("")
|
n_parts = get_n_parts(hparams["dim"])
|
||||||
|
|
||||||
|
for p in range(n_parts):
|
||||||
|
|
||||||
|
print(f"Processing part {p}\n")
|
||||||
|
|
||||||
|
fname_model = f"{dir_model}/consolidated.0{p}.pth"
|
||||||
|
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
|
||||||
|
|
||||||
|
model = torch.load(fname_model, map_location="cpu")
|
||||||
|
|
||||||
|
with open(fname_out, "wb") as fout:
|
||||||
|
write_header(fout, hparams, ftype)
|
||||||
|
write_tokens(fout, tokenizer)
|
||||||
|
process_and_write_variables(fout, model, ftype)
|
||||||
|
|
||||||
|
del model
|
||||||
|
print(f"Done. Output file: {fname_out}, (part {p})\n")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
66
download-pth.py
Normal file
66
download-pth.py
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from tqdm import tqdm
|
||||||
|
import requests
|
||||||
|
|
||||||
|
if len(sys.argv) < 3:
|
||||||
|
print("Usage: download-pth.py dir-model model-type\n")
|
||||||
|
print(" model-type: Available models 7B, 13B, 30B or 65B")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
modelsDir = sys.argv[1]
|
||||||
|
model = sys.argv[2]
|
||||||
|
|
||||||
|
num = {
|
||||||
|
"7B": 1,
|
||||||
|
"13B": 2,
|
||||||
|
"30B": 4,
|
||||||
|
"65B": 8,
|
||||||
|
}
|
||||||
|
|
||||||
|
if model not in num:
|
||||||
|
print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
print(f"Downloading model {model}")
|
||||||
|
|
||||||
|
files = ["checklist.chk", "params.json"]
|
||||||
|
|
||||||
|
for i in range(num[model]):
|
||||||
|
files.append(f"consolidated.0{i}.pth")
|
||||||
|
|
||||||
|
resolved_path = os.path.abspath(os.path.join(modelsDir, model))
|
||||||
|
os.makedirs(resolved_path, exist_ok=True)
|
||||||
|
|
||||||
|
for file in files:
|
||||||
|
dest_path = os.path.join(resolved_path, file)
|
||||||
|
|
||||||
|
if os.path.exists(dest_path):
|
||||||
|
print(f"Skip file download, it already exists: {file}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
|
||||||
|
response = requests.get(url, stream=True)
|
||||||
|
with open(dest_path, 'wb') as f:
|
||||||
|
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
|
||||||
|
for chunk in response.iter_content(chunk_size=1024):
|
||||||
|
if chunk:
|
||||||
|
f.write(chunk)
|
||||||
|
t.update(len(chunk))
|
||||||
|
|
||||||
|
files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
|
||||||
|
for file in files2:
|
||||||
|
dest_path = os.path.join(modelsDir, file)
|
||||||
|
|
||||||
|
if os.path.exists(dest_path):
|
||||||
|
print(f"Skip file download, it already exists: {file}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
|
||||||
|
response = requests.get(url, stream=True)
|
||||||
|
with open(dest_path, 'wb') as f:
|
||||||
|
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
|
||||||
|
for chunk in response.iter_content(chunk_size=1024):
|
||||||
|
if chunk:
|
||||||
|
f.write(chunk)
|
||||||
|
t.update(len(chunk))
|
43
flake.lock
generated
Normal file
43
flake.lock
generated
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"flake-utils": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1676283394,
|
||||||
|
"narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1678470307,
|
||||||
|
"narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixos-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"flake-utils": "flake-utils",
|
||||||
|
"nixpkgs": "nixpkgs"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
48
flake.nix
Normal file
48
flake.nix
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
{
|
||||||
|
inputs = {
|
||||||
|
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||||
|
flake-utils.url = "github:numtide/flake-utils";
|
||||||
|
};
|
||||||
|
outputs = { self, nixpkgs, flake-utils }:
|
||||||
|
flake-utils.lib.eachDefaultSystem (system:
|
||||||
|
let
|
||||||
|
pkgs = import nixpkgs {
|
||||||
|
inherit system;
|
||||||
|
};
|
||||||
|
llama-python = pkgs.python310.withPackages (ps: with ps; [
|
||||||
|
torch
|
||||||
|
numpy
|
||||||
|
sentencepiece
|
||||||
|
]);
|
||||||
|
in
|
||||||
|
{
|
||||||
|
packages.default = pkgs.stdenv.mkDerivation {
|
||||||
|
name = "llama.cpp";
|
||||||
|
src = ./.;
|
||||||
|
nativeBuildInputs = with pkgs; [ cmake ];
|
||||||
|
buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
|
||||||
|
darwin.apple_sdk.frameworks.Accelerate
|
||||||
|
];
|
||||||
|
cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
|
||||||
|
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||||
|
];
|
||||||
|
installPhase = ''
|
||||||
|
mkdir -p $out/bin
|
||||||
|
mv llama $out/bin/llama
|
||||||
|
mv quantize $out/bin/quantize
|
||||||
|
echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
|
||||||
|
cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
|
||||||
|
chmod +x $out/bin/convert-pth-to-ggml
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
devShells.default = pkgs.mkShell {
|
||||||
|
packages = with pkgs; [
|
||||||
|
cmake
|
||||||
|
llama-python
|
||||||
|
] ++ lib.optionals stdenv.isDarwin [
|
||||||
|
darwin.apple_sdk.frameworks.Accelerate
|
||||||
|
];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
155
ggml.c
155
ggml.c
|
@ -607,10 +607,11 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
|
||||||
assert(k % QK == 0);
|
assert(k % QK == 0);
|
||||||
|
|
||||||
const int nb = k / QK;
|
const int nb = k / QK;
|
||||||
|
const size_t bs = 2*sizeof(float) + QK/2;
|
||||||
|
|
||||||
float * restrict pm = (float *) (y);
|
uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
|
||||||
float * restrict pd = (float *) (pm + nb);
|
uint8_t * restrict pm = ((uint8_t *)y + 0*bs + sizeof(float));
|
||||||
uint8_t * restrict pb = (uint8_t *) (pd + nb);
|
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + 2*sizeof(float));
|
||||||
|
|
||||||
uint8_t pp[QK/2];
|
uint8_t pp[QK/2];
|
||||||
|
|
||||||
|
@ -627,8 +628,10 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
|
||||||
const float d = (max - min) / ((1 << 4) - 1);
|
const float d = (max - min) / ((1 << 4) - 1);
|
||||||
const float id = d ? 1.0f/d : 0.0f;
|
const float id = d ? 1.0f/d : 0.0f;
|
||||||
|
|
||||||
pm[i] = min;
|
*(float *)pm = min;
|
||||||
pd[i] = d;
|
*(float *)pd = d;
|
||||||
|
pm += bs;
|
||||||
|
pd += bs;
|
||||||
|
|
||||||
for (int l = 0; l < QK; l += 2) {
|
for (int l = 0; l < QK; l += 2) {
|
||||||
const float v0 = (x[i*QK + l + 0] - min)*id;
|
const float v0 = (x[i*QK + l + 0] - min)*id;
|
||||||
|
@ -643,7 +646,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
|
||||||
pp[l/2] = vi0 | (vi1 << 4);
|
pp[l/2] = vi0 | (vi1 << 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(pb + i*QK/2, pp, sizeof(pp));
|
memcpy(pb, pp, sizeof(pp));
|
||||||
|
pb += bs;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -687,16 +691,17 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
|
||||||
assert(k % QK == 0);
|
assert(k % QK == 0);
|
||||||
|
|
||||||
const int nb = k / QK;
|
const int nb = k / QK;
|
||||||
|
const size_t bs = 2*sizeof(float) + QK/2;
|
||||||
|
|
||||||
const float * restrict pm = (const float *) (x);
|
const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
|
||||||
const float * restrict pd = (const float *) (pm + nb);
|
const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
|
||||||
const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
|
const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
const float m = pm[i];
|
const float d = *(const float *) (pd + i*bs);
|
||||||
const float d = pd[i];
|
const float m = *(const float *) (pm + i*bs);
|
||||||
|
|
||||||
const uint8_t * restrict pp = pb + i*QK/2;
|
const uint8_t * restrict pp = pb + i*bs;
|
||||||
|
|
||||||
for (int l = 0; l < QK; l += 2) {
|
for (int l = 0; l < QK; l += 2) {
|
||||||
const uint8_t vi = pp[l/2];
|
const uint8_t vi = pp[l/2];
|
||||||
|
@ -1584,28 +1589,109 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
|
||||||
inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
|
inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
|
||||||
const int nb = n / QK;
|
const int nb = n / QK;
|
||||||
|
|
||||||
const float * restrict pm0 = (const float *) x;
|
const size_t bs = 2*sizeof(float) + QK/2;
|
||||||
const float * restrict pm1 = (const float *) y;
|
|
||||||
|
|
||||||
const float * restrict pd0 = (const float *) (pm0 + nb);
|
const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs);
|
||||||
const float * restrict pd1 = (const float *) (pm1 + nb);
|
const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs);
|
||||||
|
|
||||||
const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
|
const uint8_t * restrict pm0 = ((const uint8_t *)x + 0*bs + sizeof(float));
|
||||||
const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
|
const uint8_t * restrict pm1 = ((const uint8_t *)y + 0*bs + sizeof(float));
|
||||||
|
|
||||||
|
const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
|
||||||
|
const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + 2*sizeof(float));
|
||||||
|
|
||||||
float sumf = 0.0;
|
float sumf = 0.0;
|
||||||
|
|
||||||
#if 1
|
#if defined(__AVX2__)
|
||||||
|
#if QK == 32
|
||||||
|
// Initialize accumulator with zeros
|
||||||
|
__m256 acc = _mm256_setzero_ps();
|
||||||
|
// Accumulator for constant offsets
|
||||||
|
float acc_offset = 0.0f;
|
||||||
|
|
||||||
|
// Main loop
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float * m0 = (const float *) (pm0 + i*bs);
|
||||||
|
const float * m1 = (const float *) (pm1 + i*bs);
|
||||||
|
|
||||||
|
const float * d0 = (const float *) (pd0 + i*bs);
|
||||||
|
const float * d1 = (const float *) (pd1 + i*bs);
|
||||||
|
|
||||||
|
const uint8_t * restrict p0 = pb0 + i*bs;
|
||||||
|
const uint8_t * restrict p1 = pb1 + i*bs;
|
||||||
|
|
||||||
|
const __m256 d0v = _mm256_broadcast_ss( d0 );
|
||||||
|
const __m256 d1v = _mm256_broadcast_ss( d1 );
|
||||||
|
const __m256 m0v = _mm256_broadcast_ss( m0 );
|
||||||
|
const __m256 m1v = _mm256_broadcast_ss( m1 );
|
||||||
|
|
||||||
|
|
||||||
|
// Compute combined scale for the block
|
||||||
|
const __m256 scale_01 = _mm256_mul_ps( d0v, d1v );
|
||||||
|
|
||||||
|
// Compute cross scales for the block
|
||||||
|
const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
|
||||||
|
const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
|
||||||
|
const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
|
||||||
|
|
||||||
|
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
|
||||||
|
__m256i bx = bytesFromNibbles( p0 );
|
||||||
|
__m256i by = bytesFromNibbles( p1 );
|
||||||
|
|
||||||
|
// Now we have a vector with bytes in [ 0 .. 15 ] interval.
|
||||||
|
|
||||||
|
// Sign-extend first 16 signed bytes into int16_t
|
||||||
|
__m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
|
||||||
|
__m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
|
||||||
|
// Compute products of int16_t integers, add pairwise
|
||||||
|
__m256i i32 = _mm256_madd_epi16( x16, y16 );
|
||||||
|
|
||||||
|
// Sign-extend last 16 signed bytes into int16_t vectors
|
||||||
|
__m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
|
||||||
|
__m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
|
||||||
|
// Accumulate products of int16_t integers
|
||||||
|
i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) );
|
||||||
|
|
||||||
|
// compute sums of unsigned bytes in bx, by in blocks of 8.
|
||||||
|
// This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000,
|
||||||
|
// which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400.
|
||||||
|
// so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ]
|
||||||
|
__m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() );
|
||||||
|
__m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() );
|
||||||
|
__m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) );
|
||||||
|
__m256 sums = _mm256_cvtepi32_ps( sumsi );
|
||||||
|
|
||||||
|
// Convert int32_t to float
|
||||||
|
__m256 p = _mm256_cvtepi32_ps( i32 );
|
||||||
|
// Apply the scale, and accumulate
|
||||||
|
// acc += d0*d1*x*y + d0*m1*x + d1*m0*y
|
||||||
|
acc = _mm256_fmadd_ps( scale_01, p, acc );
|
||||||
|
acc = _mm256_fmadd_ps( cross_scales, sums, acc );
|
||||||
|
// acc_offset += m0*m1 (for each entry in the block)
|
||||||
|
acc_offset += (*m0)*(*m1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return horizontal sum of the acc vector
|
||||||
|
__m128 res = _mm256_extractf128_ps( acc, 1 );
|
||||||
|
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
|
||||||
|
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
|
||||||
|
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
|
||||||
|
|
||||||
|
sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
|
||||||
|
#else
|
||||||
|
#error "not implemented for QK"
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
// scalar
|
// scalar
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
const float m0 = pm0[i];
|
const float m0 = *(const float *) (pm0 + i*bs);
|
||||||
const float m1 = pm1[i];
|
const float m1 = *(const float *) (pm1 + i*bs);
|
||||||
|
|
||||||
const float d0 = pd0[i];
|
const float d0 = *(const float *) (pd0 + i*bs);
|
||||||
const float d1 = pd1[i];
|
const float d1 = *(const float *) (pd1 + i*bs);
|
||||||
|
|
||||||
const uint8_t * restrict p0 = pb0 + i*QK/2;
|
const uint8_t * restrict p0 = pb0 + i*bs;
|
||||||
const uint8_t * restrict p1 = pb1 + i*QK/2;
|
const uint8_t * restrict p1 = pb1 + i*bs;
|
||||||
|
|
||||||
for (int j = 0; j < QK/2; j++) {
|
for (int j = 0; j < QK/2; j++) {
|
||||||
const uint8_t v0 = p0[j];
|
const uint8_t v0 = p0[j];
|
||||||
|
@ -1839,16 +1925,17 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res
|
||||||
assert(n % QK == 0);
|
assert(n % QK == 0);
|
||||||
|
|
||||||
const int nb = n / QK;
|
const int nb = n / QK;
|
||||||
|
const size_t bs = 2*sizeof(float) + QK/2;
|
||||||
|
|
||||||
const float * restrict pm = (const float *) (x);
|
const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
|
||||||
const float * restrict pd = (const float *) (pm + nb);
|
const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
|
||||||
const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
|
const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
const float m = pm[i];
|
const float d = *(const float *) (pd + i*bs);
|
||||||
const float d = pd[i];
|
const float m = *(const float *) (pm + i*bs);
|
||||||
|
|
||||||
const uint8_t * restrict pp = pb + i*QK/2;
|
const uint8_t * restrict pp = pb + i*bs;
|
||||||
|
|
||||||
for (int l = 0; l < QK; l += 2) {
|
for (int l = 0; l < QK; l += 2) {
|
||||||
const uint8_t vi = pp[l/2];
|
const uint8_t vi = pp[l/2];
|
||||||
|
@ -5469,7 +5556,7 @@ static void ggml_compute_forward_rms_norm_f32(
|
||||||
const size_t nb2 = dst->nb[2];
|
const size_t nb2 = dst->nb[2];
|
||||||
const size_t nb3 = dst->nb[3];
|
const size_t nb3 = dst->nb[3];
|
||||||
|
|
||||||
const ggml_float eps = 1e-5f; // TODO: make this a parameter
|
const ggml_float eps = 1e-6f; // TODO: make this a parameter
|
||||||
|
|
||||||
// TODO: optimize
|
// TODO: optimize
|
||||||
for (int i03 = 0; i03 < ne03; i03++) {
|
for (int i03 = 0; i03 < ne03; i03++) {
|
||||||
|
@ -9231,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
|
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
|
||||||
if (cgraph->n_threads <= 0) {
|
|
||||||
cgraph->n_threads = 8;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int n_threads = cgraph->n_threads;
|
const int n_threads = cgraph->n_threads;
|
||||||
|
|
||||||
struct ggml_compute_state_shared state_shared = {
|
struct ggml_compute_state_shared state_shared = {
|
||||||
|
|
91
main.cpp
91
main.cpp
|
@ -86,7 +86,7 @@ struct llama_model {
|
||||||
};
|
};
|
||||||
|
|
||||||
// load the model's weights from a file
|
// load the model's weights from a file
|
||||||
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
|
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
|
||||||
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||||
|
|
||||||
std::vector<char> f_buf(1024*1024);
|
std::vector<char> f_buf(1024*1024);
|
||||||
|
@ -143,16 +143,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||||
|
|
||||||
// load vocab
|
// load vocab
|
||||||
{
|
{
|
||||||
const int32_t n_vocab = model.hparams.n_vocab;
|
|
||||||
|
|
||||||
if (n_vocab != model.hparams.n_vocab) {
|
|
||||||
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
|
||||||
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string word;
|
std::string word;
|
||||||
for (int i = 0; i < n_vocab; i++) {
|
for (int i = 0; i < model.hparams.n_vocab; i++) {
|
||||||
uint32_t len;
|
uint32_t len;
|
||||||
fin.read((char *) &len, sizeof(len));
|
fin.read((char *) &len, sizeof(len));
|
||||||
|
|
||||||
|
@ -184,8 +176,6 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const ggml_type wtype2 = GGML_TYPE_F32;
|
|
||||||
|
|
||||||
auto & ctx = model.ctx;
|
auto & ctx = model.ctx;
|
||||||
|
|
||||||
size_t ctx_size = 0;
|
size_t ctx_size = 0;
|
||||||
|
@ -217,8 +207,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||||
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
|
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
|
||||||
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
|
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
|
||||||
|
|
||||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
|
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
|
||||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
|
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
|
||||||
|
|
||||||
ctx_size += (5 + 10*n_layer)*256; // object overhead
|
ctx_size += (5 + 10*n_layer)*256; // object overhead
|
||||||
|
|
||||||
|
@ -245,7 +235,6 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||||
|
|
||||||
const int n_embd = hparams.n_embd;
|
const int n_embd = hparams.n_embd;
|
||||||
const int n_layer = hparams.n_layer;
|
const int n_layer = hparams.n_layer;
|
||||||
const int n_ctx = hparams.n_ctx;
|
|
||||||
const int n_vocab = hparams.n_vocab;
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
model.layers.resize(n_layer);
|
model.layers.resize(n_layer);
|
||||||
|
@ -304,8 +293,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||||
const int n_mem = n_layer*n_ctx;
|
const int n_mem = n_layer*n_ctx;
|
||||||
const int n_elements = n_embd*n_mem;
|
const int n_elements = n_embd*n_mem;
|
||||||
|
|
||||||
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
|
||||||
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
|
||||||
|
|
||||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||||
|
|
||||||
|
@ -547,8 +536,6 @@ bool llama_eval(
|
||||||
const int n_vocab = hparams.n_vocab;
|
const int n_vocab = hparams.n_vocab;
|
||||||
const int n_rot = hparams.n_embd/hparams.n_head;
|
const int n_rot = hparams.n_embd/hparams.n_head;
|
||||||
|
|
||||||
const int d_key = n_embd/n_head;
|
|
||||||
|
|
||||||
// TODO: check if this size scales with n_ctx linearly and remove constant. somehow I feel it wasn't the case
|
// TODO: check if this size scales with n_ctx linearly and remove constant. somehow I feel it wasn't the case
|
||||||
// static size_t buf_size = hparams.n_ctx*1024*1024;
|
// static size_t buf_size = hparams.n_ctx*1024*1024;
|
||||||
static size_t buf_size = 512u*1024*1024;
|
static size_t buf_size = 512u*1024*1024;
|
||||||
|
@ -801,6 +788,11 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (params.n_ctx > 2048) {
|
||||||
|
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
||||||
|
"expect poor results\n", __func__, params.n_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
if (params.seed < 0) {
|
if (params.seed < 0) {
|
||||||
params.seed = time(NULL);
|
params.seed = time(NULL);
|
||||||
}
|
}
|
||||||
|
@ -822,8 +814,9 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// load the model
|
// load the model
|
||||||
{
|
{
|
||||||
|
const ggml_type memory_type = params.memory_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||||
const int64_t t_start_us = ggml_time_us();
|
const int64_t t_start_us = ggml_time_us();
|
||||||
if (!llama_model_load(params.model, model, vocab, params.n_ctx)) {
|
if (!llama_model_load(params.model, model, vocab, params.n_ctx, memory_type)) {
|
||||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -845,14 +838,31 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::vector<float> logits;
|
std::vector<float> logits;
|
||||||
|
|
||||||
|
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||||
|
params.prompt.insert(0, 1, ' ');
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
|
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
|
||||||
|
|
||||||
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
|
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
|
||||||
|
|
||||||
|
// prefix & suffix for instruct mode
|
||||||
|
const std::vector<gpt_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
|
||||||
|
const std::vector<gpt_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);
|
||||||
|
|
||||||
|
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||||
|
if (params.instruct) {
|
||||||
|
params.interactive = true;
|
||||||
|
params.antiprompt = "### Instruction:\n\n";
|
||||||
|
}
|
||||||
|
|
||||||
// tokenize the reverse prompt
|
// tokenize the reverse prompt
|
||||||
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
|
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
|
||||||
|
|
||||||
|
// enable interactive mode if reverse prompt is specified
|
||||||
|
if (!antiprompt_inp.empty()) {
|
||||||
|
params.interactive = true;
|
||||||
|
}
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
@@ -873,7 +883,7 @@ int main(int argc, char ** argv) {
 
         fprintf(stderr, "%s: interactive mode on.\n", __func__);
 
-        if(antiprompt_inp.size()) {
+        if (antiprompt_inp.size()) {
             fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
             fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
             for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
@@ -895,31 +905,27 @@ int main(int argc, char ** argv) {
     std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 
 
     if (params.interactive) {
         fprintf(stderr, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
 #endif
                " - Press Return to return control to LLaMa.\n"
-               " - If you want to submit another line, end your input in '\\'.\n");
+               " - If you want to submit another line, end your input in '\\'.\n\n");
+        is_interacting = true;
     }
 
-    int remaining_tokens = params.n_predict;
     int input_consumed = 0;
     bool input_noecho = false;
 
-    // prompt user immediately after the starting prompt has been loaded
-    if (params.interactive_start) {
-        is_interacting = true;
-    }
-
+    int remaining_tokens = params.n_predict;
     // set the color for the prompt which will be output initially
     if (params.use_color) {
         printf(ANSI_COLOR_YELLOW);
     }
 
-    while (remaining_tokens > 0) {
+    while (remaining_tokens > 0 || params.interactive) {
         // predict
         if (embd.size() > 0) {
             const int64_t t_start_us = ggml_time_us();
@@ -972,13 +978,13 @@ int main(int argc, char ** argv) {
             last_n_tokens.erase(last_n_tokens.begin());
             last_n_tokens.push_back(embd_inp[input_consumed]);
             ++input_consumed;
-            if (embd.size() > params.n_batch) {
+            if ((int) embd.size() >= params.n_batch) {
                 break;
             }
         }
 
         // reset color to default if we there is no pending user input
-        if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) {
+        if (!input_noecho && params.use_color && (int) embd_inp.size() == input_consumed) {
            printf(ANSI_COLOR_RESET);
         }
     }
@@ -1000,19 +1006,26 @@ int main(int argc, char ** argv) {
                 is_interacting = true;
             }
             if (is_interacting) {
+                if (params.instruct) {
+                    input_consumed = embd_inp.size();
+                    embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
+
+                    printf("\n> ");
+                }
+
                 // currently being interactive
-                bool another_line=true;
+                bool another_line = true;
                 while (another_line) {
                     fflush(stdout);
                     char buf[256] = {0};
                     int n_read;
-                    if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
+                    if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
                     if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
                         // presumable empty line, consume the newline
-                        scanf("%*c");
+                        std::ignore = scanf("%*c");
                         n_read=0;
                     }
-                    if(params.use_color) printf(ANSI_COLOR_RESET);
+                    if (params.use_color) printf(ANSI_COLOR_RESET);
 
                     if (n_read > 0 && buf[n_read-1]=='\\') {
                         another_line = true;
@@ -1027,6 +1040,10 @@ int main(int argc, char ** argv) {
                     std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
+                    if (params.instruct) {
+                        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                    }
+
                     remaining_tokens -= line_inp.size();
 
                     input_noecho = true; // do not echo this again
@@ -1038,10 +1055,14 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (embd.back() == 2) {
+            if (params.interactive) {
+                is_interacting = true;
+            } else {
             fprintf(stderr, " [end of text]\n");
             break;
         }
     }
+        }
 
 #if defined (_WIN32)
     signal(SIGINT, SIG_DFL);
1
prompts/alpaca.txt
Normal file
@@ -0,0 +1 @@
+Below is an instruction that describes a task. Write a response that appropriately completes the request.
7
prompts/chat-with-bob.txt
Normal file
@@ -0,0 +1,7 @@
+Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+User: Hello, Bob.
+Bob: Hello. How may I help you today?
+User: Please tell me the largest city in Europe.
+Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
+User:
122
utils.cpp
@@ -16,6 +16,18 @@
 #endif
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
 
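The default-thread detection added above counts whitespace-delimited "processor" tokens in /proc/cpuinfo. A self-contained sketch of the same idiom, run on a fake cpuinfo excerpt (assuming each logical CPU contributes exactly one standalone "processor" word, which is what the heuristic relies on):

#include <algorithm>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>

// Count "processor" tokens the same way the option parser does, but from a
// made-up in-memory excerpt instead of the real /proc/cpuinfo.
int main() {
    std::istringstream cpuinfo(
        "processor : 0\nmodel name : Example CPU\n"
        "processor : 1\nmodel name : Example CPU\n");
    const auto n = std::count(std::istream_iterator<std::string>(cpuinfo),
                              std::istream_iterator<std::string>(),
                              std::string("processor"));
    std::cout << n << "\n"; // prints 2: one token per core entry
    return 0;
}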
@@ -26,19 +38,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "-p" || arg == "--prompt") {
             params.prompt = argv[++i];
         } else if (arg == "-f" || arg == "--file") {
-
             std::ifstream file(argv[++i]);
-
-            std::copy(std::istreambuf_iterator<char>(file),
-                      std::istreambuf_iterator<char>(),
-                      back_inserter(params.prompt));
-
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
         } else if (arg == "-n" || arg == "--n_predict") {
             params.n_predict = std::stoi(argv[++i]);
         } else if (arg == "--top_k") {
             params.top_k = std::stoi(argv[++i]);
         } else if (arg == "-c" || arg == "--ctx_size") {
             params.n_ctx = std::stoi(argv[++i]);
+        } else if (arg == "--memory_f16") {
+            params.memory_f16 = true;
         } else if (arg == "--top_p") {
             params.top_p = std::stof(argv[++i]);
         } else if (arg == "--temp") {
@@ -53,9 +65,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.model = argv[++i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
-            params.interactive_start = true;
+        } else if (arg == "-ins" || arg == "--instruct") {
+            params.instruct = true;
         } else if (arg == "--color") {
             params.use_color = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -73,13 +84,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
     fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
-    fprintf(stderr, "  --interactive-start   run in interactive mode and poll user input at startup\n");
+    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
     fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
     fprintf(stderr, "                        in interactive mode, poll user input upon seeing PROMPT\n");
     fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
@@ -95,6 +106,7 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
     fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
     fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --memory_f16          use f16 instead of f32 for memory key+value\n");
     fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
@@ -275,40 +287,56 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     return tokens;
 }
 
+// TODO: Calculate this constant from the vocabulary
+#define MAX_TOKEN_LEN 18
+// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
-
     std::vector<gpt_vocab::id> res;
+    std::vector<int> score;
+    std::vector<gpt_vocab::id> prev;
+    int len = text.length();
+
+    score.resize(len + 1);
+    prev.resize(len + 1);
+
+    // Forward pass
+    for (int i = 0; i < len; i++) {
+        int max_len = std::min(len - i, MAX_TOKEN_LEN);
+        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
+            auto sub = text.substr(i, sub_len);
+            auto token = vocab.token_to_id.find(sub);
+            if (token != vocab.token_to_id.end()) {
+                int token_score = sub.length() * sub.length();
+                int local_score = score[i] + token_score;
+                int next = i + sub_len;
+                if (score[next] < local_score) {
+                    score[next] = local_score;
+                    prev[next] = (*token).second;
+                }
+            }
+        }
+    }
+
+    // Backward pass
+    int i = len;
+    while (i > 0) {
+        gpt_vocab::id token_id = prev[i];
+        if (token_id == 0) {
+            // TODO: Return error or something more meaningful
+            printf("failed to tokenize string!\n");
+            break;
+        }
+        res.push_back(token_id);
+        auto token = (*vocab.id_to_token.find(token_id)).second;
+        i -= token.length();
+    }
 
     if (bos) {
         res.push_back(1); // TODO: replace with vocab.bos
     }
 
-    //find the longest token that matches the text
-    int pos = 0;
-    while (true) {
-        int l = 0;
-        int t = 0;
-        for (const auto & kv : vocab.id_to_token) {
-            if (kv.second.size() < l) continue;
-            if (kv.second.size() > text.size() - pos) continue;
-            if (text.substr(pos, kv.second.size()) == kv.second) {
-                l = kv.second.size();
-                t = kv.first;
-            }
-        }
-
-        if (l == 0) {
-            break;
-        }
-
-        res.push_back(t);
-        pos += l;
-    }
+    // Pieces are in reverse order so correct that
+    std::reverse(res.begin(), res.end());
 
     return res;
 }
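The rewritten tokenizer replaces the old greedy longest-match loop with a small dynamic program: the forward pass scores every reachable end position with the sum of squared piece lengths, and the backward pass follows `prev` to recover the pieces in reverse order. A toy, self-contained sketch of the same idea on a made-up vocabulary (so "hello", score 25, beats "he" + "llo", score 4 + 9 = 13):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Toy illustration of the DP above; vocabulary, ids and text are made up.
int main() {
    const std::map<std::string, int> token_to_id = {
        {"h", 1}, {"e", 2}, {"l", 3}, {"o", 4}, {"he", 5}, {"llo", 6}, {"hello", 7},
    };
    const std::string text = "hello";
    const int len = text.length();

    std::vector<int> score(len + 1, 0);          // best score ending at each position
    std::vector<int> prev(len + 1, 0);           // id of the piece that achieves it
    std::vector<std::string> prev_piece(len + 1); // the piece itself, for the walk back

    for (int i = 0; i < len; i++) {
        for (int sub_len = 1; i + sub_len <= len; sub_len++) {
            const std::string sub = text.substr(i, sub_len);
            const auto it = token_to_id.find(sub);
            if (it == token_to_id.end()) continue;
            const int local = score[i] + sub_len * sub_len; // longer pieces score quadratically
            if (score[i + sub_len] < local) {
                score[i + sub_len]      = local;
                prev[i + sub_len]       = it->second;
                prev_piece[i + sub_len] = sub;
            }
        }
    }

    // Walk back from the end: prints the single piece "hello" (id 7).
    for (int i = len; i > 0; i -= prev_piece[i].length()) {
        std::printf("piece: '%s' (id %d)\n", prev_piece[i].c_str(), prev[i]);
    }
    return 0;
}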
@@ -489,7 +517,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
 
 size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
     const int nb = k / qk;
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
+    const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
+    const size_t row_size = nb*bs;
 
     assert(k % qk == 0);
 
|
||||||
|
@ -499,9 +528,9 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
|
||||||
char * pdst = (char *) dst;
|
char * pdst = (char *) dst;
|
||||||
|
|
||||||
for (int j = 0; j < n; j += k) {
|
for (int j = 0; j < n; j += k) {
|
||||||
float * pm = (float *) (pdst + (j/k)*row_size);
|
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
|
||||||
float * pd = (float *) (pm + nb);
|
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
|
||||||
uint8_t * pb = (uint8_t *) (pd + nb);
|
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
|
||||||
|
|
||||||
//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
|
//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
|
||||||
|
|
@@ -519,8 +548,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
             const float d = (max - min) / ((1 << 4) - 1);
             const float id = d ? 1.0f/d : 0.0f;
 
-            pm[i] = min;
-            pd[i] = d;
+            *(float *) pd = d;
+            *(float *) pm = min;
+            pd += bs;
+            pm += bs;
 
             for (int l = 0; l < qk; l += 2) {
                 const float v0 = (src[j + i*qk + l + 0] - min)*id;
@@ -538,7 +569,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
                 pp[l/2] = vi0 | (vi1 << 4);
             }
 
-            memcpy(pb + i*qk/2, pp, pp_size);
+            memcpy(pb, pp, pp_size);
+            pb += bs;
         }
     }
 }
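The q4_1 change above switches from planar per-row arrays (all mins, then all deltas, then all packed nibbles) to self-contained blocks: each group of qk values is stored as a float delta, a float min, and qk/2 packed 4-bit values, bs bytes per block. A hedged sketch of reading one such block back (the helper and its names are illustrative, not code from the repository):

#include <cstdint>
#include <cstring>
#include <vector>

// Dequantize a single q4_1 block laid out as [d: float][min: float][qk/2 packed nibbles],
// mirroring the write order above: low nibble = even element, high nibble = odd element.
std::vector<float> dequantize_q4_1_block(const uint8_t * block, int qk) {
    float d, min;
    std::memcpy(&d,   block,                 sizeof(float));
    std::memcpy(&min, block + sizeof(float), sizeof(float));
    const uint8_t * pb = block + 2 * sizeof(float);

    std::vector<float> out(qk);
    for (int l = 0; l < qk; l += 2) {
        const uint8_t byte = pb[l / 2];
        out[l + 0] = (byte & 0x0F) * d + min; // low nibble
        out[l + 1] = (byte >>   4) * d + min; // high nibble
    }
    return out;
}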
7
utils.h
@@ -18,6 +18,7 @@ struct gpt_params {
     int32_t n_predict = 128; // new tokens to predict
     int32_t repeat_last_n = 64;  // last n tokens to penalize
     int32_t n_ctx = 512; //context size
+    bool memory_f16 = false; // use f16 instead of f32 for memory kv
 
     // sampling parameters
     int32_t top_k = 40;
@@ -28,13 +29,13 @@ struct gpt_params {
     int32_t n_batch = 8; // batch size for prompt processing
 
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
-    std::string prompt;
+    std::string prompt = "";
+    std::string antiprompt = ""; // string upon seeing which more user input is prompted
 
     bool use_color = false; // use color to distinguish generations and inputs
 
     bool interactive = false; // interactive mode
-    bool interactive_start = false; // reverse prompt immediately
-    std::string antiprompt = ""; // string upon seeing which more user input is prompted
+    bool instruct = false; // instruction mode (used for Alpaca models)
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);