Merge branch 'master' into gg/flash-attn

commit 2c41180e88
110 changed files with 11660 additions and 6357 deletions

.github/workflows/bench.yml (20 changed lines)

@@ -79,12 +79,18 @@ jobs:
            sleep 0.1
          done

-     - name: Install k6
+     - name: Set up Go
+       uses: actions/setup-go@v5
+       with:
+         go-version: '1.21'
+
+     - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
-         wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
-         tar xzf k6*.tar.gz --strip-components=1
+         go install go.k6.io/xk6/cmd/xk6@latest
+         xk6 build master \
+           --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
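For context: instead of downloading a prebuilt k6 release, the job now uses xk6, k6's extension builder. As I understand that workflow, `go install` places the xk6 tool on the Go bin path, and `xk6 build` compiles a custom k6 binary into ./k6 in the working directory with the requested extension linked in. A rough local sketch of the same steps (assuming Go is installed and $(go env GOPATH)/bin is on PATH; the `master` argument mirrors the workflow):

    go install go.k6.io/xk6/cmd/xk6@latest
    xk6 build master --with github.com/phymbert/xk6-sse
    ./k6 version   # the freshly built binary, with the SSE extension compiled in
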
@@ -118,7 +124,7 @@ jobs:
          cd examples/server/bench
          source venv/bin/activate
-         BENCH_K6_BIN_PATH=./k6 python bench.py \
+         python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch ${{ github.head_ref || github.ref_name }} \
@@ -228,9 +234,9 @@ jobs:
      <summary>Expand details for performance related PR only</summary>

      - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-     - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-     - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
-     - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
+     - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+     - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+     - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
      - ${{ env.BENCH_GRAPH_XLABEL }}

.github/workflows/build.yml (22 changed lines)

@@ -52,7 +52,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-         ctest -L main --verbose --timeout 900
+         ctest -L 'main|curl' --verbose --timeout 900

      - name: Determine tag name
        id: tag
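As a side note on the ctest change: -L takes a regular expression matched against each test's LABELS property, so 'main|curl' selects tests labelled either main or curl, and the quotes keep the shell from treating | as a pipe. A minimal local equivalent, assuming an already-configured build directory:

    cd build
    ctest -L 'main|curl' --verbose --timeout 900
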
@@ -101,7 +101,9 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-         cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+         # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+         # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+         cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -209,21 +211,21 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-         sudo apt-get install build-essential
+         sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-         cmake .. -DLLAMA_FATAL_WARNINGS=ON
+         cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
        run: |
          cd build
-         ctest -L main --verbose --timeout 900
+         ctest -L 'main|curl' --verbose --timeout 900

      - name: Test llama2c conversion
        id: llama2c_test
@@ -938,6 +940,12 @@ jobs:
      - name: Download artifacts
        id: download-artifact
        uses: actions/download-artifact@v4
+       with:
+         path: ./artifact
+
+     - name: Move artifacts
+       id: move_artifacts
+       run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release

      - name: Create release
        id: create_release
@@ -956,7 +964,7 @@ jobs:
            const path = require('path');
            const fs = require('fs');
            const release_id = '${{ steps.create_release.outputs.id }}';
-           for (let file of await fs.readdirSync('./artifact')) {
+           for (let file of await fs.readdirSync('./artifact/release')) {
              if (path.extname(file) === '.zip') {
                console.log('uploadReleaseAsset', file);
                await github.repos.uploadReleaseAsset({
@@ -964,7 +972,7 @@ jobs:
                  repo: context.repo.repo,
                  release_id: release_id,
                  name: file,
-                 data: await fs.readFileSync(`./artifact/${file}`)
+                 data: await fs.readFileSync(`./artifact/release/${file}`)
                });
              }
            }

.github/workflows/docker.yml (10 changed lines)

@@ -91,6 +91,12 @@ jobs:
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

+     - name: Downcase github.repository_owner
+       run: |
+         echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
+       env:
+         GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
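The downcasing step exists because container registries such as ghcr.io only accept lowercase image names, while github.repository_owner keeps the owner's original casing. The ${VAR@L} syntax is bash's parameter-transformation operator for lowercasing (it needs a reasonably recent bash; the L transformation appeared around bash 5.1). A minimal sketch with an illustrative value:

    GITHUB_REPOSITORY_OWNER='LlamaFan42'    # hypothetical owner name
    echo "${GITHUB_REPOSITORY_OWNER@L}"     # prints: llamafan42
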
@@ -98,7 +104,7 @@ jobs:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-         tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+         tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
@@ -107,5 +113,5 @@ jobs:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
-         tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+         tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}

.gitignore (1 changed line)

@@ -48,6 +48,7 @@ models-mnt
 /convert-llama2c-to-ggml
 /embd-input-test
 /embedding
+/eval-callback
 /gguf
 /gguf-llama-simple
 /gguf-split

AUTHORS (new file, 655 added lines)

@@ -0,0 +1,655 @@
# date: Tue Apr 9 09:17:14 EEST 2024
# this file is auto-generated by scripts/gen-authors.sh

0cc4m <picard12@live.de>
|
||||||
|
0xspringtime <110655352+0xspringtime@users.noreply.github.com>
|
||||||
|
2f38b454 <dxf@protonmail.com>
|
||||||
|
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
|
||||||
|
44670 <44670@users.noreply.github.com>
|
||||||
|
AN Long <aisk@users.noreply.github.com>
|
||||||
|
AT <manyoso@users.noreply.github.com>
|
||||||
|
Aarni Koskela <akx@iki.fi>
|
||||||
|
Aaron Miller <apage43@ninjawhale.com>
|
||||||
|
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
|
||||||
|
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
|
||||||
|
Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
|
||||||
|
Adithya Balaji <adithya.b94@gmail.com>
|
||||||
|
AdithyanI <adithyan.i4internet@gmail.com>
|
||||||
|
Adrian <smith.adriane@gmail.com>
|
||||||
|
Adrian Hesketh <a-h@users.noreply.github.com>
|
||||||
|
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
|
||||||
|
Aisuko <urakiny@gmail.com>
|
||||||
|
Alberto <57916483+albbus-stack@users.noreply.github.com>
|
||||||
|
Alex <awhill19@icloud.com>
|
||||||
|
Alex Azarov <alex@azarov.by>
|
||||||
|
Alex Azarov <alexander.azarov@mapbox.com>
|
||||||
|
Alex Klinkhamer <from.github.com.917@grencez.dev>
|
||||||
|
Alex Klinkhamer <git@grencez.dev>
|
||||||
|
Alex Nguyen <tiendung@users.noreply.github.com>
|
||||||
|
Alex Petenchea <alex.petenchea@gmail.com>
|
||||||
|
Alex Renda <alexrenda@users.noreply.github.com>
|
||||||
|
Alex von Gluck IV <kallisti5@unixzen.com>
|
||||||
|
Alexey Parfenov <zxed@alkatrazstudio.net>
|
||||||
|
Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
|
||||||
|
Ali Nehzat <ali.nehzat@thanks.dev>
|
||||||
|
Ali Tariq <ali.tariq@10xengineers.ai>
|
||||||
|
Alon <alonfaraj@gmail.com>
|
||||||
|
AlpinDale <52078762+AlpinDale@users.noreply.github.com>
|
||||||
|
AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
|
||||||
|
Ananta Bastola <anantarajbastola@gmail.com>
|
||||||
|
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
|
||||||
|
András Salamon <ott2@users.noreply.github.com>
|
||||||
|
Andrei <abetlen@gmail.com>
|
||||||
|
Andrew Canis <andrew.canis@gmail.com>
|
||||||
|
Andrew Duffy <a10y@users.noreply.github.com>
|
||||||
|
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
|
||||||
|
Arik Poznanski <arikpoz@users.noreply.github.com>
|
||||||
|
Artem <guinmoon@gmail.com>
|
||||||
|
Artyom Lebedev <vagran.ast@gmail.com>
|
||||||
|
Asbjørn Olling <asbjornolling@gmail.com>
|
||||||
|
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
|
||||||
|
Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
|
||||||
|
Ashraful Islam <ashraful.meche@gmail.com>
|
||||||
|
Atsushi Tatsuma <yoshoku@outlook.com>
|
||||||
|
Austin <77757836+teleprint-me@users.noreply.github.com>
|
||||||
|
AustinMroz <austinmroz@utexas.edu>
|
||||||
|
BADR <contact@pythops.com>
|
||||||
|
Bach Le <bach@bullno1.com>
|
||||||
|
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
|
||||||
|
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
|
||||||
|
Behnam M <58621210+ibehnam@users.noreply.github.com>
|
||||||
|
Ben Garney <bengarney@users.noreply.github.com>
|
||||||
|
Ben Siraphob <bensiraphob@gmail.com>
|
||||||
|
Ben Williams <ben@719ben.com>
|
||||||
|
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
|
||||||
|
Bernat Vadell <hounter.caza@gmail.com>
|
||||||
|
Bodo Graumann <mail@bodograumann.de>
|
||||||
|
Bono Lv <lvscar@users.noreply.github.com>
|
||||||
|
Borislav Stanimirov <b.stanimirov@abv.bg>
|
||||||
|
Branden Butler <bwtbutler@hotmail.com>
|
||||||
|
Brian <mofosyne@gmail.com>
|
||||||
|
Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||||
|
CJ Pais <cj@cjpais.com>
|
||||||
|
CRD716 <crd716@gmail.com>
|
||||||
|
Cameron <csteele@steelecameron.com>
|
||||||
|
Cameron Kaiser <classilla@users.noreply.github.com>
|
||||||
|
Casey Primozic <casey@cprimozic.net>
|
||||||
|
Casey Primozic <me@ameo.link>
|
||||||
|
CausalLM <148736309+CausalLM@users.noreply.github.com>
|
||||||
|
Cebtenzzre <cebtenzzre@gmail.com>
|
||||||
|
Chad Brewbaker <crb002@gmail.com>
|
||||||
|
Cheng Shao <terrorjack@type.dance>
|
||||||
|
Chris Kuehl <ckuehl@ckuehl.me>
|
||||||
|
Christian Demsar <christian@github.email.demsar.us>
|
||||||
|
Christian Demsar <crasm@git.vczf.us>
|
||||||
|
Christian Falch <875252+chrfalch@users.noreply.github.com>
|
||||||
|
Christian Kögler <ck3d@gmx.de>
|
||||||
|
Clark Saben <76020733+csaben@users.noreply.github.com>
|
||||||
|
Clint Herron <hanclinto@gmail.com>
|
||||||
|
Cuong Trinh Manh <nguoithichkhampha@gmail.com>
|
||||||
|
DAN™ <dranger003@gmail.com>
|
||||||
|
Damian Stewart <d@damianstewart.com>
|
||||||
|
Dane Madsen <dane_madsen@hotmail.com>
|
||||||
|
DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
|
||||||
|
Daniel Bevenius <daniel.bevenius@gmail.com>
|
||||||
|
Daniel Drake <drake@endlessos.org>
|
||||||
|
Daniel Hiltgen <dhiltgen@users.noreply.github.com>
|
||||||
|
Daniel Illescas Romero <illescas.daniel@protonmail.com>
|
||||||
|
DannyDaemonic <DannyDaemonic@gmail.com>
|
||||||
|
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
|
||||||
|
Dave Della Costa <ddellacosta+github@gmail.com>
|
||||||
|
David Friehs <david@friehs.info>
|
||||||
|
David Kennedy <dakennedyd@gmail.com>
|
||||||
|
David Pflug <david@pflug.email>
|
||||||
|
David Renshaw <dwrenshaw@gmail.com>
|
||||||
|
David Sommers <12738+databyte@users.noreply.github.com>
|
||||||
|
David Yang <davidyang6us@gmail.com>
|
||||||
|
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
|
||||||
|
Dean <Dean.Sinaean@gmail.com>
|
||||||
|
Deins <deinsegle@gmail.com>
|
||||||
|
Didzis Gosko <didzis@users.noreply.github.com>
|
||||||
|
Don Mahurin <dmahurin@users.noreply.github.com>
|
||||||
|
DooWoong Lee (David) <manics99@naver.com>
|
||||||
|
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
|
||||||
|
Douglas Hanley <thesecretaryofwar@gmail.com>
|
||||||
|
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
|
||||||
|
Ebey Abraham <ebey97@gmail.com>
|
||||||
|
Ed Lee <edilee@mozilla.com>
|
||||||
|
Ed Lepedus <ed.lepedus@googlemail.com>
|
||||||
|
Edward Taylor <edeetee@gmail.com>
|
||||||
|
Elbios <141279586+Elbios@users.noreply.github.com>
|
||||||
|
Engininja2 <139037756+Engininja2@users.noreply.github.com>
|
||||||
|
Equim <sayaka@ekyu.moe>
|
||||||
|
Eric Sommerlade <es0m@users.noreply.github.com>
|
||||||
|
Eric Zhang <34133756+EZForever@users.noreply.github.com>
|
||||||
|
Erik Garrison <erik.garrison@gmail.com>
|
||||||
|
Erik Scholz <Green-Sky@users.noreply.github.com>
|
||||||
|
Ettore Di Giacinto <mudler@users.noreply.github.com>
|
||||||
|
Evan Jones <evan.q.jones@gmail.com>
|
||||||
|
Evan Miller <emmiller@gmail.com>
|
||||||
|
Eve <139727413+netrunnereve@users.noreply.github.com>
|
||||||
|
Evgeny Kurnevsky <kurnevsky@gmail.com>
|
||||||
|
Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
|
||||||
|
ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
|
||||||
|
FK <sozforex@gmail.com>
|
||||||
|
Fabian <cmdrf@users.noreply.github.com>
|
||||||
|
Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
|
||||||
|
Faez Shakil <faez.shakil@gmail.com>
|
||||||
|
FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
|
||||||
|
Fattire <528174+fat-tire@users.noreply.github.com>
|
||||||
|
Felix <stenbackfelix@gmail.com>
|
||||||
|
Finn Voorhees <finnvoorhees@gmail.com>
|
||||||
|
Firat <firatkiral@gmail.com>
|
||||||
|
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
|
||||||
|
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
|
||||||
|
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
|
||||||
|
FrankHB <frankhb1989@gmail.com>
|
||||||
|
Frederik Vogel <Schaltfehler@users.noreply.github.com>
|
||||||
|
Gabe Goodhart <gabe.l.hart@gmail.com>
|
||||||
|
GainLee <perfecter.gen@gmail.com>
|
||||||
|
Galunid <karolek1231456@gmail.com>
|
||||||
|
Gary Linscott <glinscott@gmail.com>
|
||||||
|
Gary Mulder <gjmulder@gmail.com>
|
||||||
|
Genkagaku.GPT <hlhr202@163.com>
|
||||||
|
Georgi Gerganov <ggerganov@gmail.com>
|
||||||
|
Gilad S <giladgd@users.noreply.github.com>
|
||||||
|
GiviMAD <GiviMAD@users.noreply.github.com>
|
||||||
|
Govlzkoy <gotope@users.noreply.github.com>
|
||||||
|
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
|
||||||
|
Guillaume Wenzek <gwenzek@users.noreply.github.com>
|
||||||
|
Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
|
||||||
|
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
|
||||||
|
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
|
||||||
|
Haohui Mai <ricetons@gmail.com>
|
||||||
|
Haoxiang Fei <tonyfettes@tonyfettes.com>
|
||||||
|
Harald Fernengel <harald.fernengel@here.com>
|
||||||
|
Hatsune Miku <129688334+at8u@users.noreply.github.com>
|
||||||
|
Henk Poley <HenkPoley@gmail.com>
|
||||||
|
Henri Vasserman <henv@hot.ee>
|
||||||
|
Henrik Forstén <henrik.forsten@gmail.com>
|
||||||
|
Herman Semenov <GermanAizek@yandex.ru>
|
||||||
|
Hesen Peng <hesen.peng@gmail.com>
|
||||||
|
Hoang Nguyen <hugo53@users.noreply.github.com>
|
||||||
|
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
|
||||||
|
Howard Su <howard0su@gmail.com>
|
||||||
|
Hua Jiang <allenhjiang@outlook.com>
|
||||||
|
Huawei Lin <huaweilin.cs@gmail.com>
|
||||||
|
Ian Bull <irbull@eclipsesource.com>
|
||||||
|
Ian Bull <irbull@gmail.com>
|
||||||
|
Ian Scrivener <github@zilogy.asia>
|
||||||
|
Ido S <ido.pluto@gmail.com>
|
||||||
|
IgnacioFDM <ignaciofdm@gmail.com>
|
||||||
|
Igor Okulist <okigan@gmail.com>
|
||||||
|
Ikko Eltociear Ashimine <eltociear@gmail.com>
|
||||||
|
Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
|
||||||
|
Ionoclast Laboratories <brigham@ionoclast.com>
|
||||||
|
Isaac McFadyen <isaac@imcf.me>
|
||||||
|
IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
|
||||||
|
Ivan Komarov <Ivan.Komarov@dfyz.info>
|
||||||
|
Ivan Stepanov <ivanstepanovftw@gmail.com>
|
||||||
|
JH23X <165871467+JH23X@users.noreply.github.com>
|
||||||
|
Jack Mousseau <jmousseau@users.noreply.github.com>
|
||||||
|
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
|
||||||
|
Jag Chadha <jagtesh@gmail.com>
|
||||||
|
Jakub N <jakubniemczyk97@gmail.com>
|
||||||
|
James Reynolds <magnusviri@users.noreply.github.com>
|
||||||
|
Jan Boon <jan.boon@kaetemi.be>
|
||||||
|
Jan Boon <kaetemi@gmail.com>
|
||||||
|
Jan Ploski <jpl@plosquare.com>
|
||||||
|
Jannis Schönleber <joennlae@gmail.com>
|
||||||
|
Jared Van Bortel <cebtenzzre@gmail.com>
|
||||||
|
Jared Van Bortel <jared@nomic.ai>
|
||||||
|
Jason McCartney <jmac@theroot.org>
|
||||||
|
Jean-Christophe Hoelt <hoelt@fovea.cc>
|
||||||
|
Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
|
||||||
|
Jed Fox <git@jedfox.com>
|
||||||
|
Jeffrey Quesnelle <emozilla@nousresearch.com>
|
||||||
|
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
|
||||||
|
Jhen-Jie Hong <iainst0409@gmail.com>
|
||||||
|
Jiahao Li <liplus17@163.com>
|
||||||
|
Jian Liao <jianliao@users.noreply.github.com>
|
||||||
|
JidongZhang-THU <1119708529@qq.com>
|
||||||
|
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
|
||||||
|
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
|
||||||
|
Johannes Gäßler <johannesg@5d6.de>
|
||||||
|
Johannes Rudolph <johannes.rudolph@gmail.com>
|
||||||
|
John <78893154+cmp-nct@users.noreply.github.com>
|
||||||
|
John Balis <phobossystems@gmail.com>
|
||||||
|
John Smith <67539080+kingsidelee@users.noreply.github.com>
|
||||||
|
JohnnyB <jboero@users.noreply.github.com>
|
||||||
|
Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
|
||||||
|
Jorge A <161275481+jorgealias@users.noreply.github.com>
|
||||||
|
Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
|
||||||
|
Joseph Stahl <1269177+josephst@users.noreply.github.com>
|
||||||
|
Joyce <joycebrum@google.com>
|
||||||
|
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
|
||||||
|
Judd <foldl@users.noreply.github.com>
|
||||||
|
Julius Arkenberg <arki05@users.noreply.github.com>
|
||||||
|
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
|
||||||
|
Juraj Bednar <juraj@bednar.io>
|
||||||
|
Justin Parker <jparkerweb@gmail.com>
|
||||||
|
Justin Suess <justin.suess@westpoint.edu>
|
||||||
|
Justine Tunney <jtunney@gmail.com>
|
||||||
|
Juuso Alasuutari <juuso.alasuutari@gmail.com>
|
||||||
|
KASR <karim.asrih@gmail.com>
|
||||||
|
Kamil Tomšík <info@tomsik.cz>
|
||||||
|
Karsten Weiss <knweiss@gmail.com>
|
||||||
|
Karthick <j.karthic2004@gmail.com>
|
||||||
|
Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
|
||||||
|
Karthik Sethuraman <k.seth1993@gmail.com>
|
||||||
|
Kasumi <90275229+kasumi-1@users.noreply.github.com>
|
||||||
|
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
|
||||||
|
Keiichi Tabata <keiichi.tabata@outlook.com>
|
||||||
|
Kenvix ⭐ <kenvixzure@live.com>
|
||||||
|
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
|
||||||
|
Kevin Ji <1146876+kevinji@users.noreply.github.com>
|
||||||
|
Kevin Kwok <antimatter15@gmail.com>
|
||||||
|
Kevin Lo <kevlo@kevlo.org>
|
||||||
|
Kolen Cheung <ickc@users.noreply.github.com>
|
||||||
|
Konstantin Herud <konstantin.herud@denkbares.com>
|
||||||
|
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
|
||||||
|
Kunshang Ji <kunshang.ji@intel.com>
|
||||||
|
Kyle Liang <liangmanlai@gmail.com>
|
||||||
|
Kyle Mistele <kyle@mistele.com>
|
||||||
|
Kylin <56434533+KyL0N@users.noreply.github.com>
|
||||||
|
Lars Grammel <lars.grammel@gmail.com>
|
||||||
|
Laura <Tijntje_7@msn.com>
|
||||||
|
Lee <44310445+lx200916@users.noreply.github.com>
|
||||||
|
Lee Drake <b.lee.drake@gmail.com>
|
||||||
|
Leng Yue <lengyue@lengyue.me>
|
||||||
|
LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
|
||||||
|
Leonardo Neumann <leonardo@neumann.dev.br>
|
||||||
|
Li Tan <tanliboy@gmail.com>
|
||||||
|
Linwei Wang <wanix1988@gmail.com>
|
||||||
|
LoganDark <github@logandark.mozmail.com>
|
||||||
|
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
||||||
|
Luciano <lucianostrika44@gmail.com>
|
||||||
|
Luo Tian <lt@basecity.com>
|
||||||
|
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
||||||
|
Maarten ter Huurne <maarten@treewalker.org>
|
||||||
|
Mack Straight <eiz@users.noreply.github.com>
|
||||||
|
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
||||||
|
MaggotHATE <clay1326@gmail.com>
|
||||||
|
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
||||||
|
Marco Matthies <71844+marcom@users.noreply.github.com>
|
||||||
|
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
||||||
|
Marian Cepok <marian.cepok@gmail.com>
|
||||||
|
Mark Fairbairn <thebaron88@gmail.com>
|
||||||
|
Marko Tasic <mtasic85@gmail.com>
|
||||||
|
Martin Krasser <krasserm@googlemail.com>
|
||||||
|
Martin Schwaighofer <mschwaig@users.noreply.github.com>
|
||||||
|
Marvin Gießing <marvin.giessing@gmail.com>
|
||||||
|
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
||||||
|
Matheus C. França <matheus-catarino@hotmail.com>
|
||||||
|
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
||||||
|
Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
|
||||||
|
Mathijs de Bruin <mathijs@mathijsfietst.nl>
|
||||||
|
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
||||||
|
Matt Pulver <matt.pulver@heavy.ai>
|
||||||
|
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
||||||
|
Matthew Tejo <matthew.tejo@gmail.com>
|
||||||
|
Matvey Soloviev <blackhole89@gmail.com>
|
||||||
|
Maxime <672982+maximegmd@users.noreply.github.com>
|
||||||
|
Maximilian Winter <maximilian.winter.91@gmail.com>
|
||||||
|
Meng Zhang <meng@tabbyml.com>
|
||||||
|
Meng, Hengyu <hengyu.meng@intel.com>
|
||||||
|
Merrick Christensen <merrick.christensen@gmail.com>
|
||||||
|
Michael Coppola <m18coppola@gmail.com>
|
||||||
|
Michael Hueschen <m@mhueschen.dev>
|
||||||
|
Michael Kesper <mkesper@schokokeks.org>
|
||||||
|
Michael Klimenko <mklimenko29@gmail.com>
|
||||||
|
Michael Podvitskiy <podvitskiymichael@gmail.com>
|
||||||
|
Michael Potter <NanoTekGuy@Gmail.com>
|
||||||
|
Michaël de Vries <vriesdemichael@gmail.com>
|
||||||
|
Mihai <mihai.chirculescu@yahoo.com>
|
||||||
|
Mike <ytianhui2004@gmail.com>
|
||||||
|
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
||||||
|
Mirko185 <mirkosig@gmail.com>
|
||||||
|
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
||||||
|
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
||||||
|
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
||||||
|
Murilo Santana <mvrilo@gmail.com>
|
||||||
|
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
||||||
|
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
||||||
|
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
||||||
|
Nebula <infinitewormhole@gmail.com>
|
||||||
|
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
||||||
|
Neuman Vong <neuman.vong@gmail.com>
|
||||||
|
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
||||||
|
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
||||||
|
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
||||||
|
Nigel Bosch <pnigelb@gmail.com>
|
||||||
|
Niklas Korz <niklas@niklaskorz.de>
|
||||||
|
Nindaleth <Nindaleth@users.noreply.github.com>
|
||||||
|
Oleksandr Nikitin <oleksandr@tvori.info>
|
||||||
|
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
||||||
|
Olivier Chafik <ochafik@users.noreply.github.com>
|
||||||
|
Ondřej Čertík <ondrej@certik.us>
|
||||||
|
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
||||||
|
Paul Tsochantaris <ptsochantaris@icloud.com>
|
||||||
|
Pavol Rusnak <pavol@rusnak.io>
|
||||||
|
Pedro Cuenca <pedro@huggingface.co>
|
||||||
|
Peter Sugihara <peter@campsh.com>
|
||||||
|
Phil H <5756783+phiharri@users.noreply.github.com>
|
||||||
|
Philip Taron <philip.taron@gmail.com>
|
||||||
|
Phillip Kravtsov <phillip@kravtsov.net>
|
||||||
|
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
|
||||||
|
Pierrick Hymbert <pierrick.hymbert@gmail.com>
|
||||||
|
Przemysław Pawełczyk <przemoc@gmail.com>
|
||||||
|
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
|
||||||
|
Qingyou Meng <meng.qingyou@gmail.com>
|
||||||
|
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
|
||||||
|
RJ Adriaansen <adriaansen@eshcc.eur.nl>
|
||||||
|
Radoslav Gerganov <rgerganov@gmail.com>
|
||||||
|
Radosław Gryta <radek.gryta@gmail.com>
|
||||||
|
Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
|
||||||
|
Rand Xie <randxiexyy29@gmail.com>
|
||||||
|
Randall Fitzgerald <randall@dasaku.net>
|
||||||
|
Reinforce-II <fate@eastal.com>
|
||||||
|
Riceball LEE <snowyu.lee@gmail.com>
|
||||||
|
Richard Kiss <him@richardkiss.com>
|
||||||
|
Richard Roberson <richardr1126@gmail.com>
|
||||||
|
Rick G <26732651+TheFlipbook@users.noreply.github.com>
|
||||||
|
Rickard Edén <rickardeden@gmail.com>
|
||||||
|
Rickard Hallerbäck <rickard.hallerback@gmail.com>
|
||||||
|
Rickey Bowers Jr <bitRAKE@gmail.com>
|
||||||
|
Riley Stewart <ristew@users.noreply.github.com>
|
||||||
|
Rinne <AsakusaRinne@gmail.com>
|
||||||
|
Rinne <liu_yaohui1998@126.com>
|
||||||
|
Robert Brisita <986796+rbrisita@users.noreply.github.com>
|
||||||
|
Robert Sung-wook Shin <edp1096@users.noreply.github.com>
|
||||||
|
Robey Holderith <robey@flaminglunchbox.net>
|
||||||
|
Robyn <robyngraf@users.noreply.github.com>
|
||||||
|
Roger Meier <r.meier@siemens.com>
|
||||||
|
Roland <14355895+rbur0425@users.noreply.github.com>
|
||||||
|
Romain D <90720+Artefact2@users.noreply.github.com>
|
||||||
|
Romain Neutron <romain@neutron.io>
|
||||||
|
Roman Parykin <donderom@gmail.com>
|
||||||
|
Ron Evans <ron@hybridgroup.com>
|
||||||
|
Ron Jailall <rojailal@gmail.com>
|
||||||
|
Ronny Brendel <ronnybrendel@gmail.com>
|
||||||
|
Ronsor <ronsor@ronsor.pw>
|
||||||
|
Rowan Hart <rowanbhart@gmail.com>
|
||||||
|
Rune <43761327+Rune-AI@users.noreply.github.com>
|
||||||
|
Ryan Landay <rlanday@gmail.com>
|
||||||
|
Ryder Wishart <ryderwishart@gmail.com>
|
||||||
|
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
||||||
|
SakuraUmi <yukinon244@gmail.com>
|
||||||
|
Salvador E. Tropea <stropea@inti.gob.ar>
|
||||||
|
Sam Spilsbury <smspillaz@gmail.com>
|
||||||
|
Sami Farin <3876865+Safari77@users.noreply.github.com>
|
||||||
|
Samuel Maynard <samwmaynard@gmail.com>
|
||||||
|
Sang-Kil Park <sang.park@42dot.ai>
|
||||||
|
Seb C <47074056+Sebby37@users.noreply.github.com>
|
||||||
|
Sebastián A <sebastian.aedo29@gmail.com>
|
||||||
|
SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
||||||
|
Senemu <10880819+Senemu@users.noreply.github.com>
|
||||||
|
Sergey Alirzaev <zl29ah@gmail.com>
|
||||||
|
Sergio López <slp@sinrega.org>
|
||||||
|
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
||||||
|
ShadovvBeast <ShadovvBeast@gmail.com>
|
||||||
|
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
||||||
|
Shangning Xu <32517059+xushangning@users.noreply.github.com>
|
||||||
|
Shijie <821898965@qq.com>
|
||||||
|
Shintarou Okada <kokuzen@gmail.com>
|
||||||
|
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
||||||
|
Shouzheng Liu <lshzh.hi@gmail.com>
|
||||||
|
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
||||||
|
Simon Willison <swillison@gmail.com>
|
||||||
|
Siwen Yu <yusiwen@gmail.com>
|
||||||
|
Sky Yan <skyan83@gmail.com>
|
||||||
|
Slaren <2141330+slaren@users.noreply.github.com>
|
||||||
|
Slava Primenko <primenko.s@gmail.com>
|
||||||
|
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
|
||||||
|
Someone <sergei.kozlukov@aalto.fi>
|
||||||
|
Someone Serge <sergei.kozlukov@aalto.fi>
|
||||||
|
Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
|
||||||
|
Spencer Sutton <spencersutton@users.noreply.github.com>
|
||||||
|
Srinivas Billa <nivibilla@gmail.com>
|
||||||
|
Stefan Sydow <stefan@sydow.email>
|
||||||
|
Stephan Walter <stephan@walter.name>
|
||||||
|
Stephen Nichols <snichols@users.noreply.github.com>
|
||||||
|
Steve Grubb <ausearch.1@gmail.com>
|
||||||
|
Steven Roussey <sroussey@gmail.com>
|
||||||
|
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
||||||
|
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
||||||
|
SuperUserNameMan <yoann@terminajones.com>
|
||||||
|
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
|
||||||
|
Taikono-Himazin <kazu@po.harenet.ne.jp>
|
||||||
|
Tameem <113388789+AhmadTameem@users.noreply.github.com>
|
||||||
|
Tamotsu Takahashi <ttakah+github@gmail.com>
|
||||||
|
Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
|
||||||
|
Thatcher Chamberlin <j.thatcher.c@gmail.com>
|
||||||
|
Theia Vogel <theia@vgel.me>
|
||||||
|
Thérence <13496987+Royalphax@users.noreply.github.com>
|
||||||
|
Thibault Terrasson <thibault.terrasson@gmail.com>
|
||||||
|
Thomas Klausner <wiz@gatalith.at>
|
||||||
|
Tim Miller <drasticactions@users.noreply.github.com>
|
||||||
|
Timmy Knight <r2d2fish@gmail.com>
|
||||||
|
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
|
||||||
|
Ting Lou <ting.lou@gmail.com>
|
||||||
|
Ting Sun <suntcrick@gmail.com>
|
||||||
|
Tobias Lütke <tobi@shopify.com>
|
||||||
|
Tom C <tom.corelis@gmail.com>
|
||||||
|
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
||||||
|
Tomas <tom.tomas.36478119@gmail.com>
|
||||||
|
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
||||||
|
Tristan Ross <rosscomputerguy@protonmail.com>
|
||||||
|
Tungsten842 <886724vf@anonaddy.me>
|
||||||
|
Tungsten842 <quantmint@protonmail.com>
|
||||||
|
Tushar <ditsuke@protonmail.com>
|
||||||
|
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
||||||
|
Uzo Nweke <uzoechi@gmail.com>
|
||||||
|
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
||||||
|
Val Kharitonov <mail@kharvd.com>
|
||||||
|
Valentin Konovalov <valle.ketsujin@gmail.com>
|
||||||
|
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
||||||
|
Victor Z. Peng <ziliangdotme@gmail.com>
|
||||||
|
Vlad <spitfireage@gmail.com>
|
||||||
|
Vladimir <bogdad@gmail.com>
|
||||||
|
Vladimir Malyutin <first-leon@yandex.ru>
|
||||||
|
Vladimir Zorin <vladimir@deviant.guru>
|
||||||
|
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
|
||||||
|
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
|
||||||
|
Weird Constructor <weirdconstructor@gmail.com>
|
||||||
|
Welby Seely <welbyseely@gmail.com>
|
||||||
|
Wentai Zhang <rchardx@gmail.com>
|
||||||
|
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
||||||
|
Willy Tarreau <w@1wt.eu>
|
||||||
|
Wu Jian Ping <wujjpp@hotmail.com>
|
||||||
|
Wu Jian Ping <wujp@greatld.com>
|
||||||
|
Xiake Sun <xiake.sun@intel.com>
|
||||||
|
Xiang (Kevin) Li <kevinli020508@gmail.com>
|
||||||
|
Xiao-Yong Jin <jinxiaoyong@gmail.com>
|
||||||
|
XiaotaoChen <chenxiaotao1234@gmail.com>
|
||||||
|
Xiaoyi Chen <cxychina@gmail.com>
|
||||||
|
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
||||||
|
Xuan Son Nguyen <thichthat@gmail.com>
|
||||||
|
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
||||||
|
Yiming Cui <conandiy@vip.qq.com>
|
||||||
|
Yishuo Wang <MeouSker77@outlook.com>
|
||||||
|
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
||||||
|
Yui <dev@sleepyyui.com>
|
||||||
|
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
|
||||||
|
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
|
||||||
|
ZHAOKAI WANG <sanxianwei@163.com>
|
||||||
|
Zane Shannon <z@zcs.me>
|
||||||
|
Zay <95888118+isaiahbjork@users.noreply.github.com>
|
||||||
|
Zenix <zenixls2@gmail.com>
|
||||||
|
Zhang Peiyuan <a1286225768@gmail.com>
|
||||||
|
ZhouYuChen <zhouyuchen@naver.com>
|
||||||
|
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
||||||
|
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
||||||
|
Zsapi <martin1.zsapka@gmail.com>
|
||||||
|
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
|
||||||
|
adel boussaken <netdur@gmail.com>
|
||||||
|
afrideva <95653597+afrideva@users.noreply.github.com>
|
||||||
|
akawrykow <142945436+akawrykow@users.noreply.github.com>
|
||||||
|
alexpinel <93524949+alexpinel@users.noreply.github.com>
|
||||||
|
alonfaraj <alonfaraj@gmail.com>
|
||||||
|
andrijdavid <david@geek.mg>
|
||||||
|
anon998 <131767832+anon998@users.noreply.github.com>
|
||||||
|
anzz1 <anzz1@live.com>
|
||||||
|
apaz <aarpazdera@gmail.com>
|
||||||
|
apcameron <37645737+apcameron@users.noreply.github.com>
|
||||||
|
arcrank <arcrank@gmail.com>
|
||||||
|
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
||||||
|
at8u <129688334+at8u@users.noreply.github.com>
|
||||||
|
automaticcat <daogiatuank54@gmail.com>
|
||||||
|
bandoti <141645996+bandoti@users.noreply.github.com>
|
||||||
|
beiller <beiller@gmail.com>
|
||||||
|
bhubbb <79117352+bhubbb@users.noreply.github.com>
|
||||||
|
bmwl <brian.marshall@tolko.com>
|
||||||
|
bobqianic <129547291+bobqianic@users.noreply.github.com>
|
||||||
|
bryanSwk <93190252+bryanSwk@users.noreply.github.com>
|
||||||
|
bsilvereagle <bsilvereagle@users.noreply.github.com>
|
||||||
|
bssrdf <merlintiger@hotmail.com>
|
||||||
|
byte-6174 <88070277+byte-6174@users.noreply.github.com>
|
||||||
|
cebtenzzre <cebtenzzre@gmail.com>
|
||||||
|
chaihahaha <chai836275709@gmail.com>
|
||||||
|
chiranko <96988916+chiranko@users.noreply.github.com>
|
||||||
|
clibdev <52199778+clibdev@users.noreply.github.com>
|
||||||
|
clyang <clyang@clyang.net>
|
||||||
|
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
|
||||||
|
coezbek <c.oezbek@gmail.com>
|
||||||
|
comex <comexk@gmail.com>
|
||||||
|
compilade <113953597+compilade@users.noreply.github.com>
|
||||||
|
crasm <crasm@git.vczf.net>
|
||||||
|
crasm <crasm@git.vczf.us>
|
||||||
|
daboe01 <daboe01@googlemail.com>
|
||||||
|
david raistrick <keen99@users.noreply.github.com>
|
||||||
|
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
||||||
|
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
||||||
|
divinity76 <divinity76@gmail.com>
|
||||||
|
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
||||||
|
drbh <david.richard.holtz@gmail.com>
|
||||||
|
ds5t5 <145942675+ds5t5@users.noreply.github.com>
|
||||||
|
dylan <canardleteer@users.noreply.github.com>
|
||||||
|
eastriver <lee@eastriver.dev>
|
||||||
|
ebraminio <ebraminio@gmail.com>
|
||||||
|
eiery <19350831+eiery@users.noreply.github.com>
|
||||||
|
eric8607242 <e0928021388@gmail.com>
|
||||||
|
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||||
|
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
||||||
|
gliptic <gliptic@users.noreply.github.com>
|
||||||
|
goerch <jhr.walter@t-online.de>
|
||||||
|
grahameth <96447521+grahameth@users.noreply.github.com>
|
||||||
|
gwjr <502526+gwjr@users.noreply.github.com>
|
||||||
|
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
||||||
|
hankcs <cnhankmc@gmail.com>
|
||||||
|
hoangmit <hoangmit@users.noreply.github.com>
|
||||||
|
hongbo.mo <352280764@qq.com>
|
||||||
|
howlger <eclipse@voormann.de>
|
||||||
|
howlger <github@voormann.de>
|
||||||
|
hutli <6594598+hutli@users.noreply.github.com>
|
||||||
|
hutli <hutli@hutli.hu>
|
||||||
|
hutli <jensstaermose@hotmail.com>
|
||||||
|
hxer7963 <hxer7963@gmail.com>
|
||||||
|
hydai <z54981220@gmail.com>
|
||||||
|
iSma <ismail.senhaji@gmail.com>
|
||||||
|
iacore <74560659+iacore@users.noreply.github.com>
|
||||||
|
igarnier <igarnier@protonmail.com>
|
||||||
|
iohub <rickyang.pro@gmail.com>
|
||||||
|
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||||
|
jameswu2014 <545426914@qq.com>
|
||||||
|
jneem <joeneeman@gmail.com>
|
||||||
|
johnson442 <56517414+johnson442@users.noreply.github.com>
|
||||||
|
jon-chuang <9093549+jon-chuang@users.noreply.github.com>
|
||||||
|
jp-x-g <jpxg-dev@protonmail.com>
|
||||||
|
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
||||||
|
kaizau <kaizau@users.noreply.github.com>
|
||||||
|
kalomaze <66376113+kalomaze@users.noreply.github.com>
|
||||||
|
kang <tpdns9032100@gmail.com>
|
||||||
|
katsu560 <118887472+katsu560@users.noreply.github.com>
|
||||||
|
kchro3 <62481661+kchro3@users.noreply.github.com>
|
||||||
|
khimaros <me@khimaros.com>
|
||||||
|
kiltyj <kiltyj@gmail.com>
|
||||||
|
klosax <131523366+klosax@users.noreply.github.com>
|
||||||
|
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
||||||
|
kunnis <kunnis@users.noreply.github.com>
|
||||||
|
kuronekosaiko <EvanChanJ@163.com>
|
||||||
|
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
||||||
|
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
||||||
|
l3utterfly <gc.pthzfoldr@gmail.com>
|
||||||
|
ldwang <ftgreat@163.com>
|
||||||
|
le.chang <cljs118@126.com>
|
||||||
|
leejet <leejet714@gmail.com>
|
||||||
|
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
||||||
|
lon <114724657+longregen@users.noreply.github.com>
|
||||||
|
m3ndax <adrian.goessl@outlook.com>
|
||||||
|
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||||
|
makomk <makosoft@googlemail.com>
|
||||||
|
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||||
|
mdrokz <mohammadmunshi@gmail.com>
|
||||||
|
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||||
|
minarchist <minarchist@users.noreply.github.com>
|
||||||
|
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
||||||
|
mmyjona <jonathan.gonse@gmail.com>
|
||||||
|
momonga <115213907+mmnga@users.noreply.github.com>
|
||||||
|
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
||||||
|
mzcu <milos.cubrilo@gmail.com>
|
||||||
|
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
||||||
|
ngc92 <7938269+ngc92@users.noreply.github.com>
|
||||||
|
nhamanasu <45545786+nhamanasu@users.noreply.github.com>
|
||||||
|
niansa/tuxifan <anton-sa@web.de>
|
||||||
|
niansa/tuxifan <tuxifan@posteo.de>
|
||||||
|
ningshanwutuobang <ningshanwutuobang@gmail.com>
|
||||||
|
nold <Nold360@users.noreply.github.com>
|
||||||
|
nopperl <54780682+nopperl@users.noreply.github.com>
|
||||||
|
nusu-github <29514220+nusu-github@users.noreply.github.com>
|
||||||
|
olexiyb <olexiyb@gmail.com>
|
||||||
|
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||||
|
opparco <parco.opaai@gmail.com>
|
||||||
|
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||||
|
perserk <perserk@gmail.com>
|
||||||
|
postmasters <namnguyen@google.com>
|
||||||
|
pudepiedj <pudepiedj@gmail.com>
|
||||||
|
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
|
||||||
|
qouoq <qouoq@fastmail.com>
|
||||||
|
qunash <anzoria@gmail.com>
|
||||||
|
rabidcopy <rabidcopy@yahoo.com>
|
||||||
|
rankaiyx <rankaiyx@rankaiyx.com>
|
||||||
|
rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
|
||||||
|
rhuddleston <ryan.huddleston@percona.com>
|
||||||
|
rimoliga <53384203+rimoliga@users.noreply.github.com>
|
||||||
|
runfuture <runfuture@users.noreply.github.com>
|
||||||
|
sandyiscool <sandyiscool@gmail.com>
|
||||||
|
semidark <me@semidark.net>
|
||||||
|
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||||
|
shibe2 <shibe@tuta.io>
|
||||||
|
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||||
|
sjinzh <sjinzh@gmail.com>
|
||||||
|
slaren <2141330+slaren@users.noreply.github.com>
|
||||||
|
slaren <slarengh@gmail.com>
|
||||||
|
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||||
|
staviq <staviq@gmail.com>
|
||||||
|
stduhpf <stephduh@live.fr>
|
||||||
|
swittk <switt1995@gmail.com>
|
||||||
|
takov751 <40316768+takov751@users.noreply.github.com>
|
||||||
|
tarcey <cey.tarik@gmail.com>
|
||||||
|
texmex76 <40733439+texmex76@users.noreply.github.com>
|
||||||
|
thement <40525767+thement@users.noreply.github.com>
|
||||||
|
tjohnman <tjohnman@users.noreply.github.com>
|
||||||
|
tslmy <tslmy@users.noreply.github.com>
|
||||||
|
ubik2 <ubik2@users.noreply.github.com>
|
||||||
|
uint256_t <konndennsa@gmail.com>
|
||||||
|
uint256_t <maekawatoshiki1017@gmail.com>
|
||||||
|
unbounded <haakon@likedan.net>
|
||||||
|
valiray <133289098+valiray@users.noreply.github.com>
|
||||||
|
vodkaslime <646329483@qq.com>
|
||||||
|
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||||
|
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||||
|
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
||||||
|
whoreson <139810751+whoreson@users.noreply.github.com>
|
||||||
|
wonjun Jang <strutive07@gmail.com>
|
||||||
|
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||||
|
xaedes <xaedes@gmail.com>
|
||||||
|
xaedes <xaedes@googlemail.com>
|
||||||
|
xloem <0xloem@gmail.com>
|
||||||
|
yangli2 <yangli2@gmail.com>
|
||||||
|
yuiseki <yuiseki@gmail.com>
|
||||||
|
zakkor <edward.partenie@gmail.com>
|
||||||
|
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||||
|
zrm <trustiosity.zrm@gmail.com>
|
||||||
|
源文雨 <41315874+fumiama@users.noreply.github.com>
|
||||||
|
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>

CMakeLists.txt

@@ -88,6 +88,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ON)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA "llama: use CUDA" OFF)
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
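The new LLAMA_LLAMAFILE option defaults to ON, so CMake builds get the llamafile SGEMM path automatically. A plausible way to opt out would be to override the option at configure time:

    cmake .. -DLLAMA_LLAMAFILE=OFF
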
@@ -286,6 +287,7 @@ if (LLAMA_METAL)
         ${METALKIT_FRAMEWORK}
     )
 endif()

 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
@@ -368,6 +370,10 @@ if (LLAMA_BLAS)
     endif()
 endif()

+if (LLAMA_LLAMAFILE)
+    add_compile_definitions(GGML_USE_LLAMAFILE)
+endif()
+
 if (LLAMA_QKK_64)
     add_compile_definitions(GGML_QKK_64)
 endif()
@@ -1151,6 +1157,8 @@ add_library(ggml OBJECT
     ggml-backend.h
     ggml-quants.c
     ggml-quants.h
+    sgemm.cpp
+    sgemm.h
     ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
     ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
     ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}

LICENSE (2 changed lines)

@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 Georgi Gerganov
+Copyright (c) 2023-2024 The ggml authors

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

Makefile (32 changed lines)

@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
     main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-    simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+    simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
     retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
@@ -10,7 +10,7 @@ TEST_TARGETS = \
     tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
     tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
     tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-    tests/test-json-schema-to-grammar
+    tests/test-json-schema-to-grammar tests/test-grammar-integration

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -219,6 +219,13 @@ ifdef LLAMA_DISABLE_LOGS
 MK_CPPFLAGS += -DLOG_DISABLE_LOGS
 endif # LLAMA_DISABLE_LOGS

+# disable ggml.c's use of sgemm.cpp
+ifdef LLAMA_NO_LLAMAFILE
+    MK_CPPFLAGS += -DGGML_USE_LLAMAFILE=0
+else
+    MK_CPPFLAGS += -DGGML_USE_LLAMAFILE=1
+endif
+
 # warnings
 WARN_FLAGS = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
 MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \
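In the Makefile build the llamafile SGEMM path is therefore on by default; a plausible way to turn it off is to define the new variable on the make command line, for example:

    make LLAMA_NO_LLAMAFILE=1 main

Note that ifdef only checks whether the variable is defined and non-empty, so even LLAMA_NO_LLAMAFILE=0 would disable the path.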
@@ -646,7 +653,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -676,19 +683,22 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
     $(CC) $(CFLAGS) -c $< -o $@

+sgemm.o: sgemm.cpp sgemm.h ggml.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
 unicode.o: unicode.cpp unicode.h
     $(CXX) $(CXXFLAGS) -c $< -o $@

 unicode-data.o: unicode-data.cpp unicode-data.h
     $(CXX) $(CXXFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o sgemm.o

 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
     $(CXX) $(CXXFLAGS) -c $< -o $@

 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

 common.o: common/common.cpp $(COMMON_H_DEPS)
     $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -756,7 +766,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -788,7 +798,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -800,6 +810,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -918,6 +932,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -2,6 +2,45 @@

import PackageDescription

var sources = [
    "ggml.c",
    "sgemm.cpp",
    "llama.cpp",
    "unicode.cpp",
    "unicode-data.cpp",
    "ggml-alloc.c",
    "ggml-backend.c",
    "ggml-quants.c",
]

var resources: [Resource] = []
var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [
    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
    .unsafeFlags(["-fno-objc-arc"]),
    // NOTE: NEW_LAPACK will required iOS version 16.4+
    // We should consider add this in the future when we drop support for iOS 14
    // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
    // .define("ACCELERATE_NEW_LAPACK"),
    // .define("ACCELERATE_LAPACK_ILP64")
]

#if canImport(Darwin)
sources.append("ggml-metal.m")
resources.append(.process("ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
    contentsOf: [
        .define("GGML_USE_ACCELERATE"),
        .define("GGML_USE_METAL")
    ]
)
#endif

#if os(Linux)
    cSettings.append(.define("_GNU_SOURCE"))
#endif

let package = Package(
    name: "llama",
    platforms: [

@@ -28,34 +67,11 @@ let package = Package(
            "ggml-cuda.h",
            "Makefile"
        ],
        sources: sources,
        resources: resources,
        publicHeadersPath: "spm-headers",
        cSettings: cSettings,
        linkerSettings: linkerSettings
    )
    ],
    cxxLanguageStandard: .cxx11
131
README-sycl.md
@@ -3,14 +3,14 @@

- [Background](#background)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Docker](#docker)
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
- [Known Issues](#known-issues)
- [Q&A](#qa)
- [TODO](#todo)

## Background

@@ -24,19 +24,20 @@

- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

### Llama.cpp + SYCL

The llama.cpp SYCL backend is designed primarily for **Intel GPUs**. Thanks to SYCL's cross-platform nature, it can also target other vendors' GPUs: Nvidia GPUs are already supported, and *AMD GPU support is coming*.

When targeting **Intel CPUs**, it is recommended to use the llama.cpp [Intel oneMKL](README.md#intel-onemkl) backend instead.

The SYCL path follows the same design as the other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast, etc.* In the initial work, oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used.

## News
|
- 2024.4
  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.

- 2024.3
  - Release binary files of Windows.
  - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
  - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
  - Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet (under development).

@@ -54,25 +55,20 @@ When targetting **Intel CPUs**, it is recommended to use llama.cpp for [x86_64]

## OS

| OS      | Status  | Verified                           |
|---------|---------|------------------------------------|
| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
| Windows | Support | Windows 11                         |

## Hardware

### Intel GPU

**Verified devices**

| Intel GPU                     | Status  | Verified Model                  |
|-------------------------------|---------|---------------------------------|
| Intel Data Center Max Series  | Support | Max 1550, 1100                  |
| Intel Data Center Flex Series | Support | Flex 170                        |
| Intel Arc Series              | Support | Arc 770, 730M                   |
| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake |
@@ -80,30 +76,26 @@ source /opt/intel/oneapi/setvars.sh

*Notes:*

- **Memory**
  - Device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *llama-2-7b.Q4_0* requires at least 8.0 GB for an integrated GPU and 4.0 GB for a discrete GPU.

- **Execution Units (EUs)**
  - If the iGPU has fewer than 80 EUs, the inference speed will likely be too slow for practical use.
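Before moving on, it can help to confirm that the Intel GPU is actually visible to the SYCL runtime. A minimal check, assuming the oneAPI base toolkit is installed at the default `/opt/intel/oneapi` prefix and that its bundled `sycl-ls` utility is on the path after sourcing the environment, might look like this:

```sh
# assumption: default oneAPI install location
source /opt/intel/oneapi/setvars.sh
# list SYCL devices; the Intel GPU should appear as a level-zero entry,
# e.g. [ext_oneapi_level_zero:gpu]
sycl-ls
```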
### Other Vendor GPU

**Verified devices**

| Nvidia GPU               | Status  | Verified Model |
|--------------------------|---------|----------------|
| Ampere Series            | Support | A100, A4000    |
| Ampere Series *(Mobile)* | Support | RTX 40 Series  |

## Docker

The docker build option is currently limited to *intel GPU* targets.

### Build image
```sh
# Using FP16
@@ -169,30 +161,11 @@ Platform #0: Intel(R) OpenCL HD Graphics

- **Nvidia GPU**

  In order to target Nvidia GPUs through SYCL, please make sure the CUDA/cuBLAS native requirements *(found [here](README.md#cuda))* are installed.

2. **Install Intel® oneAPI Base toolkit**

- **For Intel GPU**

  The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

@@ -204,10 +177,10 @@ Upon a successful installation, SYCL is enabled for the available intel devices,

- **Adding support to Nvidia GPUs**

  **oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). Users should also make sure the plugin version matches the installed base toolkit *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.

  **oneMKL for cuBLAS**: The current oneMKL releases *(shipped with the oneAPI base toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.

```sh
git clone https://github.com/oneapi-src/oneMKL
@@ -239,7 +212,7 @@ When targeting an intel GPU, the user should expect one or more level-zero devices

- **Nvidia GPU**

  Similarly, users targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`], as below:
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]

@@ -257,10 +230,14 @@ source /opt/intel/oneapi/setvars.sh
mkdir -p build && cd build

# Option 1: Use FP16 for better performance in long-prompt inference
cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Or without "--build", run "make" next

# Option 2: Use FP32 by default
cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

# build all binaries
cmake --build . --config Release -j -v
```

#### Nvidia GPU

@@ -275,10 +252,14 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
mkdir -p build && cd build

# Option 1: Use FP16 for better performance in long-prompt inference
cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

# Option 2: Use FP32 by default
cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

# build all binaries
cmake --build . --config Release -j -v
```

### III. Run the inference
@@ -315,7 +296,7 @@ found 6 SYCL devices:
```

| Attribute              | Note                                                        |
|------------------------|-------------------------------------------------------------|
| compute capability 1.3 | Level-zero driver/runtime, recommended                      |
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |

@@ -327,7 +308,7 @@ There are two device selection modes:

- Multiple devices: Automatically select the devices with the same largest Max compute-units.

| Device selection | Parameter                              |
|------------------|----------------------------------------|
| Single device    | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default)           |
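As a concrete sketch of the two modes, an invocation might look like the following (the binary path, model file, prompt and layer count are illustrative placeholders):

```sh
# single device: run entirely on device ID 0
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 32 -ngl 33 --split-mode none --main-gpu 0

# multiple devices: split layers across the matching GPUs (default)
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 32 -ngl 33 --split-mode layer
```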
@@ -358,7 +339,6 @@ Otherwise, you can run the script:

*Notes:*

- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follows:

```sh
@@ -439,7 +419,7 @@ cd build

cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

make -j
```

Otherwise, run the `win-build-sycl.bat` wrapper, which encapsulates the instructions above:
@@ -488,7 +468,7 @@ found 6 SYCL devices:
```

| Attribute              | Note                                                       |
|------------------------|------------------------------------------------------------|
| compute capability 1.3 | Level-zero running time, recommended                       |
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases  |

@@ -501,7 +481,7 @@ There are two device selection modes:

- Multiple devices: Automatically choose the devices with the same biggest Max compute units.

| Device selection | Parameter                              |
|------------------|----------------------------------------|
| Single device    | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default)           |
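The same idea on Windows, as a sketch (executable name, model path and layer count are placeholders):

```sh
# single device: run entirely on device ID 0
main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 32 -ngl 33 --split-mode none --main-gpu 0
```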
@@ -526,7 +506,6 @@ Otherwise, run the following wrapper script:

Note:

- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follows:

```sh
@@ -542,7 +521,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

#### Build

| Name              | Value                              | Function                               |
|-------------------|------------------------------------|----------------------------------------|
| LLAMA_SYCL        | ON (mandatory)                     | Enable build with SYCL code path.      |
| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA        | Set the SYCL target device type.       |
| LLAMA_SYCL_F16    | OFF *(default)* \| ON *(optional)* | Enable FP16 build with SYCL code path. |
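For example, a configure step combining these options for an Nvidia target might look like the following sketch (run from a fresh `build` directory after sourcing the oneAPI environment; the compiler names follow the Linux instructions above):

```sh
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DLLAMA_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . --config Release -j
```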
@@ -552,18 +531,12 @@ use 1 SYCL GPUs: [0] with Max compute units:512

#### Runtime

| Name              | Value            | Function                                                                                                                   |
|-------------------|------------------|----------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                              |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
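A minimal sketch of using both variables together (model path and prompt are placeholders):

```sh
# verbose SYCL logging plus sysman-based free-memory queries for a layer-split run
GGML_SYCL_DEBUG=1 ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 32 --split-mode layer
```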
## Known Issues

- `Split-mode:[row]` is not supported.

## Q&A

@@ -575,7 +548,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

- General compiler error:

  - Remove the **build** folder or try a clean build.

- I can **not** see `[ext_oneapi_level_zero:gpu]` after installing the GPU driver on Linux.

@@ -592,6 +565,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512

### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issue/PR titles to help the SYCL team check and address them without delay.

## TODO

- Support row-layer split for multi-card runs.
21
README.md
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

### Recent API changes

- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328

@@ -93,6 +94,7 @@ Typically finetunes of the base models below are supported as well.

- [x] LLaMA 2 🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [X] Falcon
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)

@@ -119,6 +121,9 @@ Typically finetunes of the base models below are supported as well.

- [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)

(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

**Multimodal models:**
||||||
|
@ -181,7 +186,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||||
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
||||||
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
|
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
|
||||||
- [Dot](https://github.com/alexpinel/Dot) (GPL)
|
- [Dot](https://github.com/alexpinel/Dot) (GPL)
|
||||||
|
- [MindMac](https://mindmac.app) (proprietary)
|
||||||
|
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
|
||||||
|
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
|
||||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||||
|
|
||||||
---
|
---
|
||||||
|
@@ -480,7 +487,7 @@ Building the program with BLAS support may lead to some performance improvements

  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

  | Option                 | Legal values           | Default | Description |
  |------------------------|------------------------|---------|-------------|
  | LLAMA_CUDA_FORCE_DMMV  | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
  | LLAMA_CUDA_DMMV_X      | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
  | LLAMA_CUDA_MMV_Y       | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
@@ -492,7 +499,7 @@ Building the program with BLAS support may lead to some performance improvements

  This provides BLAS acceleration on HIP-supported AMD GPUs.
  Make sure to have ROCm installed.
  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).

  - Using `make`:
    ```bash

@@ -509,7 +516,7 @@ Building the program with BLAS support may lead to some performance improvements

  - Using `make` (example for target gfx1030, build with 16 CPU threads):
    ```bash
    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
    ```

  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):

@@ -517,7 +524,7 @@ Building the program with BLAS support may lead to some performance improvements

    set PATH=%HIP_PATH%\bin;%PATH%
    mkdir build
    cd build
    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
    cmake --build .
    ```
  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -529,7 +536,7 @@ Building the program with BLAS support may lead to some performance improvements

  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):

  | Option                  | Legal values           | Default | Description |
  |-------------------------|------------------------|---------|-------------|
  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
  | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
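As a sketch, these tuning options can be combined with the HIP build shown earlier (the target and values below are illustrative starting points, not recommendations):

```sh
make -j16 LLAMA_HIPBLAS=1 AMDGPU_TARGETS=gfx1030 LLAMA_CUDA_MMV_Y=2 LLAMA_CUDA_KQUANTS_ITER=1
```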
@@ -740,7 +747,7 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver

As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.

| Model | Original size | Quantized size (Q4_0) |
|------:|--------------:|----------------------:|
|    7B |         13 GB |                3.9 GB |
|   13B |         24 GB |                7.8 GB |
|   30B |         60 GB |               19.5 GB |
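For reference, the Q4_0 sizes in the table above come from quantizing an f16 GGUF model; a sketch of that step, assuming the `quantize` tool built by this repository and placeholder file names, is:

```sh
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
```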
@@ -49,11 +49,11 @@ If you intend to run multiple models in parallel with shared memory, it is your

1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.

2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.

3. Model Sharing: In a multitenant model-sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side-channel attacks on GPUs are possible, which can leak data from other models or processes running on the same system at the same time.

## Reporting a vulnerability
15
build.zig
@@ -112,6 +112,7 @@ pub fn build(b: *std.build.Builder) !void {

    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

    const ggml = make.obj("ggml", "ggml.c");
    const sgemm = make.obj("sgemm", "sgemm.cpp");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");

@@ -128,14 +129,14 @@ pub fn build(b: *std.build.Builder) !void {

    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");

    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });

    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
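For completeness, a sketch of driving this Zig build from the shell (the optimization flag and output path assume a recent Zig toolchain and may differ between versions; the model path is a placeholder):

```sh
zig build -Doptimize=ReleaseFast
./zig-out/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 32
```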
49
ci/run.sh
@@ -153,6 +153,52 @@ function gg_sum_ctest_release {
    gg_printf '```\n'
}

# test_scripts_debug

function gg_run_test_scripts_debug {
    cd ${SRC}

    set -e

    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
}

function gg_sum_test_scripts_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs test scripts in debug mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
    gg_printf '```\n'
    gg_printf '\n'
}

# test_scripts_release

function gg_run_test_scripts_release {
    cd ${SRC}

    set -e

    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
}

function gg_sum_test_scripts_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs test scripts in release mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
    gg_printf '```\n'
    gg_printf '\n'
}
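The same gguf-split smoke test can also be run by hand outside the CI harness; a sketch, assuming a local build under `./build/bin` and a writable model directory, is:

```sh
cd examples/gguf-split && bash tests.sh "$PWD/../../build/bin" /tmp/models
```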
function gg_get_model {
    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"

@@ -642,6 +688,9 @@ test $ret -eq 0 && gg_run ctest_release

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small

    test $ret -eq 0 && gg_run test_scripts_debug
    test $ret -eq 0 && gg_run test_scripts_release

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
            test $ret -eq 0 && gg_run open_llama_3b_v2
@@ -47,9 +47,6 @@ if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

set(TARGET common)

add_library(${TARGET} STATIC

@@ -63,6 +60,7 @@ add_library(${TARGET} STATIC
    grammar-parser.h
    grammar-parser.cpp
    json.hpp
    json-schema-to-grammar.cpp
    train.h
    train.cpp
    ngram-cache.h
@@ -1,4 +1,6 @@
#include "common.h"
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"

#include <algorithm>
@@ -16,6 +18,7 @@
#include <unordered_set>
#include <vector>
#include <cinttypes>
#include <codecvt>

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
@@ -27,7 +30,6 @@
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <locale>
#include <windows.h>
#include <fcntl.h>
@@ -68,6 +70,8 @@
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL

using json = nlohmann::ordered_json;

int32_t get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
@ -104,6 +108,79 @@ int32_t get_num_physical_cores() {
|
||||||
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(__x86_64__) && defined(__linux__)
|
||||||
|
#include <pthread.h>
|
||||||
|
|
||||||
|
static void cpuid(unsigned leaf, unsigned subleaf,
|
||||||
|
unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
|
||||||
|
__asm__("movq\t%%rbx,%%rsi\n\t"
|
||||||
|
"cpuid\n\t"
|
||||||
|
"xchgq\t%%rbx,%%rsi"
|
||||||
|
: "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
|
||||||
|
: "0"(leaf), "2"(subleaf));
|
||||||
|
}
|
||||||
|
|
||||||
|
static int pin_cpu(int cpu) {
|
||||||
|
cpu_set_t mask;
|
||||||
|
CPU_ZERO(&mask);
|
||||||
|
CPU_SET(cpu, &mask);
|
||||||
|
return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool is_hybrid_cpu(void) {
|
||||||
|
unsigned eax, ebx, ecx, edx;
|
||||||
|
cpuid(7, 0, &eax, &ebx, &ecx, &edx);
|
||||||
|
return !!(edx & (1u << 15));
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool is_running_on_efficiency_core(void) {
|
||||||
|
unsigned eax, ebx, ecx, edx;
|
||||||
|
cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
|
||||||
|
int intel_atom = 0x20;
|
||||||
|
int core_type = (eax & 0xff000000u) >> 24;
|
||||||
|
return core_type == intel_atom;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int count_math_cpus(int cpu_count) {
|
||||||
|
int result = 0;
|
||||||
|
for (int cpu = 0; cpu < cpu_count; ++cpu) {
|
||||||
|
if (pin_cpu(cpu)) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (is_running_on_efficiency_core()) {
|
||||||
|
continue; // efficiency cores harm lockstep threading
|
||||||
|
}
|
||||||
|
++cpu; // hyperthreading isn't useful for linear algebra
|
||||||
|
++result;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // __x86_64__ && __linux__
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns number of CPUs on system that are useful for math.
|
||||||
|
*/
|
||||||
|
int get_math_cpu_count() {
|
||||||
|
#if defined(__x86_64__) && defined(__linux__)
|
||||||
|
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
|
||||||
|
if (cpu_count < 1) {
|
||||||
|
return get_num_physical_cores();
|
||||||
|
}
|
||||||
|
if (is_hybrid_cpu()) {
|
||||||
|
cpu_set_t affinity;
|
||||||
|
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
|
||||||
|
int result = count_math_cpus(cpu_count);
|
||||||
|
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
|
||||||
|
if (result > 0) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return get_num_physical_cores();
|
||||||
|
}
|
||||||
|
|
||||||
void process_escapes(std::string & input) {
|
void process_escapes(std::string & input) {
|
||||||
std::size_t input_len = input.length();
|
std::size_t input_len = input.length();
|
||||||
std::size_t output_idx = 0;
|
std::size_t output_idx = 0;
|
||||||
|
@@ -1148,6 +1225,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        );
        return true;
    }
    if (arg == "-j" || arg == "--json-schema") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
        sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
        return true;
    }
    if (arg == "--override-kv") {
        if (++i >= argc) {
            invalid_param = true;
@@ -1353,6 +1438,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
    printf("  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
    printf("  --grammar-file FNAME  file to read grammar from\n");
    printf("  -j SCHEMA, --json-schema SCHEMA\n");
    printf("                        JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
    printf("                        For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
    printf("  --cfg-negative-prompt PROMPT\n");
    printf("                        negative prompt to use for guidance. (default: empty)\n");
    printf("  --cfg-negative-prompt-file FNAME\n");
@@ -1500,6 +1588,77 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
     GGML_UNREACHABLE();
 }
 
+// Validate if a filename is safe to use
+// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
+bool validate_file_name(const std::string & filename) {
+    if (!filename.length()) {
+        // Empty filename invalid
+        return false;
+    }
+    if (filename.length() > 255) {
+        // Limit at common largest possible filename on Linux filesystems
+        // to avoid unnecessary further validation
+        // (On systems with smaller limits it will be caught by the OS)
+        return false;
+    }
+
+    std::u32string filename_utf32;
+    try {
+        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+        filename_utf32 = converter.from_bytes(filename);
+
+        // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
+        // or invalid encodings were encountered. Reject such attempts
+        std::string filename_reencoded = converter.to_bytes(filename_utf32);
+        if (filename_reencoded != filename) {
+            return false;
+        }
+    } catch (const std::exception &) {
+        return false;
+    }
+
+    // Check for forbidden codepoints:
+    // - Control characters
+    // - Unicode equivalents of illegal characters
+    // - UTF-16 surrogate pairs
+    // - UTF-8 replacement character
+    // - Byte order mark (BOM)
+    // - Illegal characters: / \ : * ? " < > |
+    for (char32_t c : filename_utf32) {
+        if (c <= 0x1F // Control characters (C0)
+            || c == 0x7F // Control characters (DEL)
+            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
+            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
+            || c == 0x2215 // Division Slash (forward slash equivalent)
+            || c == 0x2216 // Set Minus (backslash equivalent)
+            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
+            || c == 0xFFFD // Replacement Character (UTF-8)
+            || c == 0xFEFF // Byte Order Mark (BOM)
+            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
+            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
+            return false;
+        }
+    }
+
+    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
+    // Unicode and other whitespace is not affected, only 0x20 space
+    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
+        return false;
+    }
+
+    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
+    if (filename.find("..") != std::string::npos) {
+        return false;
+    }
+
+    // Reject "."
+    if (filename == ".") {
+        return false;
+    }
+
+    return true;
+}
+
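A rough Python rendering of the same filename rules, for illustration only (the helper name is ours, not part of the codebase; the UTF-8 round-trip check is omitted because Python's str is already validated Unicode):

FORBIDDEN = set('/\\:*?"<>|') | {
    '\uff0e',  # Fullwidth Full Stop (period look-alike)
    '\u2215',  # Division Slash (forward slash look-alike)
    '\u2216',  # Set Minus (backslash look-alike)
    '\ufffd',  # Replacement Character
    '\ufeff',  # Byte Order Mark
}

def is_safe_filename(name: str) -> bool:
    if not name or len(name.encode('utf-8')) > 255:
        return False
    for ch in name:
        cp = ord(ch)
        if cp <= 0x1F or cp == 0x7F or 0x80 <= cp <= 0x9F:  # control characters
            return False
        if 0xD800 <= cp <= 0xDFFF or ch in FORBIDDEN:        # surrogates, look-alikes
            return False
    if name[0] == ' ' or name[-1] in (' ', '.'):             # stripped on Windows
        return False
    if '..' in name or name == '.':
        return False
    return True

assert is_safe_filename("model-00001-of-00002.gguf")
assert not is_safe_filename("../etc/passwd")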
 //
 // String utils
 //
 
@@ -1674,6 +1833,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.defrag_thold = params.defrag_thold;
+    cparams.cb_eval = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
 
@@ -2121,7 +2282,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
     }
 
-    {
+    if (params.warmup) {
         LOG("warming up the model with an empty run\n");
 
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
@@ -2141,23 +2302,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 std::vector<llama_token> llama_tokenize(
   const struct llama_context * ctx,
            const std::string & text,
-                         bool   add_bos,
-                         bool   special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
+                         bool   add_special,
+                         bool   parse_special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }
 
 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
            const std::string & text,
-                         bool   add_bos,
-                         bool   special) {
+                         bool   add_special,
+                         bool   parse_special) {
     // upper limit for the number of tokens
-    int n_tokens = text.length() + add_bos;
+    int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
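The resize-and-retry idiom above (call once with a guessed buffer, grow to the size reported back as a negative count, call again) in a generic Python form; `raw_tokenize` is a hypothetical low-level function used only for illustration, not an actual binding:

def tokenize_with_retry(raw_tokenize, text: str, add_special: bool) -> list[int]:
    buf = [0] * (len(text) + 2 * add_special)   # upper-bound guess, as above
    n = raw_tokenize(text, buf, add_special)
    if n < 0:                                   # buffer too small: -n is the required size
        buf = [0] * (-n)
        n = raw_tokenize(text, buf, add_special)
        assert n == len(buf)
    return buf[:n]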
@@ -39,6 +39,7 @@ extern char const *LLAMA_BUILD_TARGET;
 
 struct llama_control_vector_load_info;
 
+int get_math_cpu_count();
 int32_t get_num_physical_cores();
 
 //
@@ -48,7 +49,7 @@ int32_t get_num_physical_cores();
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
-    int32_t n_threads = get_num_physical_cores();
+    int32_t n_threads = get_math_cpu_count();
     int32_t n_threads_draft = -1;
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
@@ -80,6 +81,9 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float   defrag_thold = -1.0f; // KV cache defragmentation threshold
 
+    ggml_backend_sched_eval_callback cb_eval = nullptr;
+    void * cb_eval_user_data = nullptr;
+
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
     llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@@ -156,6 +160,7 @@ struct gpt_params {
     bool infill        = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
+    bool warmup        = true;  // warmup run
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
@@ -179,6 +184,8 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 
 void process_escapes(std::string& input);
 
+bool validate_file_name(const std::string & filename);
+
 //
 // String utils
 //
@@ -221,14 +228,14 @@ void llama_batch_add(
 std::vector<llama_token> llama_tokenize(
   const struct llama_context * ctx,
            const std::string & text,
-                         bool   add_bos,
-                         bool   special = false);
+                         bool   add_special,
+                         bool   parse_special = false);
 
 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
            const std::string & text,
-                         bool   add_bos,
-                         bool   special = false);
+                         bool   add_special,
+                         bool   parse_special = false);
 
 // tokenizes a token into a piece
 // should work similar to Python's `tokenizer.id_to_piece`
@@ -11,35 +11,101 @@
 
 using json = nlohmann::ordered_json;
 
+template <typename Iterator>
+static std::string join(Iterator begin, Iterator end, const std::string & separator);
+
+static std::string repeat(const std::string & str, size_t n);
+
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
+    if (separator_rule.empty()) {
+        if (min_items == 0 && max_items == 1) {
+            return item_rule + "?";
+        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
+            return item_rule + "+";
+        }
+    }
+
+    std::string result;
+    if (min_items > 0) {
+        if (item_rule_is_literal && separator_rule.empty()) {
+            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+        } else {
+            std::vector<std::string> items(min_items, item_rule);
+            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+        }
+    }
+
+    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
+        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
+
+        if (up_to_n == 0) {
+            return "";
+        } else if (up_to_n == 1) {
+            return "(" + content + ")?";
+        } else if (!separator_rule.empty() && !prefix_with_sep) {
+            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
+        } else {
+            std::string res = repeat("(" + content + " ", up_to_n);
+            // strip trailing space
+            res = res.substr(0, res.length() - 1);
+            res += repeat(")?", up_to_n);
+            return res;
+        }
+    };
+
+    if (min_items > 0 && max_items != min_items) {
+        result += " ";
+    }
+
+    if (max_items != std::numeric_limits<int>::max()) {
+        result += opt_repetitions(max_items - min_items, min_items > 0);
+    } else {
+        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
+        if (min_items == 0 && !separator_rule.empty()) {
+            result = "(" + item_rule + " " + item_operator + "*)?";
+        } else {
+            result += item_operator + "*";
+        }
+    }
+
+    return result;
+}
+
 const std::string SPACE_RULE = "\" \"?";
 
-std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
-    {"boolean", "(\"true\" | \"false\") space"},
-    {"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
-    {"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
-    {"value", "object | array | string | number | boolean"},
-    {"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
-    {"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
-    {"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-             "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-             "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-             "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-             "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
-    {"string", " \"\\\"\" (\n"
-               "   [^\"\\\\] |\n"
-               "   \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
-               " )* \"\\\"\" space"},
-    {"null", "\"null\" space"}
-};
-std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};
-
-std::unordered_map<std::string, std::string> DATE_RULES = {
-    {"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
-    {"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
-    {"date-time", "date \"T\" time"},
-    {"date-string", "\"\\\"\" date \"\\\"\" space"},
-    {"time-string", "\"\\\"\" time \"\\\"\" space"},
-    {"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
-};
+struct BuiltinRule {
+    std::string content;
+    std::vector<std::string> deps;
+};
+
+const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
+
+std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
+    {"boolean", {"(\"true\" | \"false\") space", {}}},
+    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
+    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
+    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
+    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
+    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
+    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
+    {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
+    {"null", {"\"null\" space", {}}},
+};
+
+std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
+    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date-time", {"date \"T\" time", {"date", "time"}}},
+    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
+    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
+    {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
+};
 
 static bool is_reserved_name(const std::string & name) {
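For intuition, a simplified Python sketch of what build_repetition produces (no separator rule and no literal fast path; this mirrors the shape of the C++ helper, it is not a drop-in port):

def build_repetition(item: str, min_items: int, max_items: int | None) -> str:
    # max_items=None means unbounded
    if min_items == 0 and max_items == 1:
        return f"{item}?"
    if min_items == 1 and max_items is None:
        return f"{item}+"
    parts = [item] * min_items                  # mandatory repetitions
    if max_items is None:
        parts.append(f"{item}*")
    else:
        opt = ""
        for _ in range(max_items - min_items):  # nested optional repetitions
            opt = f"({item} {opt})?" if opt else f"({item})?"
        if opt:
            parts.append(opt)
    return " ".join(parts)

print(build_repetition("[0-9]", 0, 15))  # same shape as the _up_to_15_digits rule above
print(build_repetition("item", 2, 4))    # item item (item (item)?)?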
@@ -47,7 +113,7 @@ static bool is_reserved_name(const std::string & name) {
     if (RESERVED_NAMES.empty()) {
         RESERVED_NAMES.insert("root");
         for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
+        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
     }
     return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
@@ -192,7 +258,7 @@ private:
         if (_dotall) {
             rule = "[\\U00000000-\\U0010FFFF]";
         } else {
-            rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
+            rule = "[^\\x0A\\x0D]";
         }
         return _add_rule("dot", rule);
     };
@@ -308,13 +374,6 @@ private:
             auto &sub = last.first;
             auto sub_is_literal = last.second;
 
-            if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
-                sub += "*";
-            } else if (min_times == 0 && max_times == 1) {
-                sub += "?";
-            } else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
-                sub += "+";
-            } else {
             if (!sub_is_literal) {
                 std::string & sub_id = sub_rule_ids[sub];
                 if (sub_id.empty()) {
@@ -322,33 +381,14 @@ private:
                 }
                 sub = sub_id;
             }
-            std::string result;
-            if (sub_is_literal && min_times > 0) {
-                result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
-            } else {
-                for (int j = 0; j < min_times; j++) {
-                    if (j > 0) {
-                        result += " ";
-                    }
-                    result += sub;
-                }
-            }
-            if (min_times > 0 && min_times < max_times) {
-                result += " ";
-            }
-            if (max_times == std::numeric_limits<int>::max()) {
-                result += sub + "*";
-            } else {
-                for (int j = min_times; j < max_times; j++) {
-                    if (j > min_times) {
-                        result += " ";
-                    }
-                    result += sub + "?";
-                }
-            }
-            seq.back().first = result;
+            seq.back().first = build_repetition(
+                sub_is_literal ? "\"" + sub + "\"" : sub,
+                min_times,
+                max_times,
+                "",
+                sub_is_literal
+            );
             seq.back().second = false;
-            }
         } else {
             std::string literal;
             auto is_non_literal = [&](char c) {
@@ -424,7 +464,7 @@ private:
         if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
             std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
             std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
-            std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+            std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
             prop_kv_rule_names["*"] = kv_rule;
             optional_props.push_back("*");
         }
@@ -486,6 +526,25 @@ private:
         return rule;
     }
 
+    std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
+        auto n = _add_rule(name, rule.content);
+        for (const auto & dep : rule.deps) {
+            BuiltinRule dep_rule;
+            auto it = PRIMITIVE_RULES.find(dep);
+            if (it == PRIMITIVE_RULES.end()) {
+                it = STRING_FORMAT_RULES.find(dep);
+                if (it == STRING_FORMAT_RULES.end()) {
+                    _errors.push_back("Rule " + dep + " not known");
+                    continue;
+                }
+            }
+            if (_rules.find(dep) == _rules.end()) {
+                _add_primitive(dep, it->second);
+            }
+        }
+        return n;
+    }
+
 public:
     SchemaConverter(
         const std::function<json(const std::string &)> & fetch_json,
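The dependency chasing in _add_primitive is easy to picture with a toy Python version (rule contents are truncated placeholders, and the dict/function names here are ours, not the converter's API):

BUILTINS = {
    "char":   ("[^\"\\\\] | \"\\\\\" (...)", []),
    "string": ("\"\\\"\" char* \"\\\"\" space", ["char"]),
    "value":  ("object | array | string | ...", ["string"]),
}

def add_primitive(rules: dict, name: str) -> str:
    content, deps = BUILTINS[name]
    rules[name] = content              # register the rule itself
    for dep in deps:                   # then pull in anything it references
        if dep not in rules:           # skip rules that were already added
            add_primitive(rules, dep)
    return name

rules = {}
add_primitive(rules, "string")
print(list(rules))                     # ['string', 'char']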
@@ -647,49 +706,33 @@ public:
             return _add_rule(rule_name, rule);
         } else {
             std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
-            std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
-            std::string successive_items;
             int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
             json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
-            int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
-            if (min_items > 0) {
-                successive_items += repeat(list_item_operator, min_items - 1);
-                min_items--;
-            }
-            if (max_items >= 0 && max_items > min_items) {
-                successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
-            } else {
-                successive_items += list_item_operator + "*";
-            }
-            std::string rule;
-            if (min_items == 0) {
-                rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
-            } else {
-                rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
-            }
-            return _add_rule(rule_name, rule);
+            int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
+            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
         }
     } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
         return _visit_pattern(schema["pattern"], rule_name);
     } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
-        return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
-    } else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
-        for (const auto & kv : DATE_RULES) {
-            _add_rule(kv.first, kv.second);
-        }
-        return schema_format + "-string";
+        return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
+    } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
+        auto prim_name = schema_format + "-string";
+        return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
+    } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
+        std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
+        int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
+        int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
+        return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
     } else if (schema.empty() || schema_type == "object") {
-        for (const auto & n : OBJECT_RULE_NAMES) {
-            _add_rule(n, PRIMITIVE_RULES.at(n));
-        }
-        return _add_rule(rule_name, "object");
+        return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
     } else {
         if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
             _errors.push_back("Unrecognized schema: " + schema.dump());
             return "";
         }
         // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-        return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
+        return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
     }
 }
@@ -129,7 +129,7 @@ llama_token llama_sampling_sample(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
         struct llama_context * ctx_cfg,
-        int idx = 0);
+        int idx = -1);
 
 // Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
 llama_token_data_array llama_sampling_prepare(
@@ -43,17 +43,18 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]")
 class Model(ABC):
     _model_classes: dict[str, type[Model]] = {}
 
-    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
+    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
         self.dir_model = dir_model
         self.ftype = ftype
         self.fname_out = fname_out
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+        self.use_temp_file = use_temp_file
         self.is_safetensors = self._is_model_safetensors()
         self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
         self.part_names = self._get_part_names()
         self.hparams = Model.load_hparams(self.dir_model)
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
 
     @property
@@ -160,7 +161,7 @@ class Model(ABC):
             data = data.astype(np.float32)
 
         # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+        if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
             data = data.astype(np.float32)
 
         # if f16 desired, convert any float32 2-dim weight tensors to float16
@@ -227,15 +228,14 @@ class Model(ABC):
             return ("pytorch_model.bin",)
         return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
 
-    def _set_vocab_gpt2(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
+    # used for GPT-2 BPE and WordPiece vocabs
+    def get_basic_vocab(self) -> tuple[list[str], list[int]]:
         tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
@@ -255,11 +255,15 @@ class Model(ABC):
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
 
+        return tokens, toktypes
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes = self.get_basic_vocab()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_qwen(self):
@@ -1203,9 +1207,91 @@ class StableLMModel(Model):
         rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
         self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
 
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_head = self.hparams.get("num_attention_heads")
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        q_norms = dict()
+        k_norms = dict()
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+            n_dims = len(data.shape)
+            if name.find("q_layernorm.norms") != -1:
+                q_norms[name] = data
+                if len(q_norms) >= (block_count * n_head):
+                    self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
+                continue
+            if name.find("k_layernorm.norms") != -1:
+                k_norms[name] = data
+                if len(k_norms) >= (block_count * n_kv_head):
+                    self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
+                continue
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+    def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
+        for bid in range(block_count):
+            datas = []
+            for xid in range(n_head):
+                ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
+                datas.append(norms[ename])
+                del norms[ename]
+            data = np.stack(datas, axis=0)
+            data_dtype = data.dtype
+            merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
+            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
 
 @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
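The q/k-norm stacking above is just an np.stack over the per-head vectors; a toy example with made-up sizes (4 heads, head_dim 8), purely for illustration:

import numpy as np

per_head = [np.full(8, i, dtype=np.float32) for i in range(4)]  # one norm vector per head
stacked = np.stack(per_head, axis=0)                            # -> shape (4, 8)
print(stacked.shape)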
@@ -1217,6 +1303,14 @@ class LlamaModel(Model):
         except FileNotFoundError:
             self._set_vocab_llama_hf()
 
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
+        special_vocab._set_special_token("prefix", 32007)
+        special_vocab._set_special_token("suffix", 32008)
+        special_vocab._set_special_token("middle", 32009)
+        special_vocab._set_special_token("eot",    32010)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -1424,6 +1518,102 @@ class GrokModel(Model):
             self.gguf_writer.add_tensor(new_name, data)
 
 
+@Model.register("DbrxForCausalLM")
+class DbrxModel(Model):
+    model_arch = gguf.MODEL_ARCH.DBRX
+
+    def set_gguf_parameters(self):
+        ffn_config = self.hparams["ffn_config"]
+        attn_config = self.hparams["attn_config"]
+        self.gguf_writer.add_name(self.hparams["model_type"])
+        self.gguf_writer.add_block_count(self.hparams["n_layers"])
+
+        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
+
+        self.gguf_writer.add_head_count(self.hparams["n_heads"])
+        self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
+
+        self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
+
+        self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+        self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
+        self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
+
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+
+        self.gguf_writer.add_file_type(self.ftype)
+        print(f"gguf: file type = {self.ftype}")
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers")
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        for name, data_torch in self.get_tensors():
+            n_expert = self.hparams["ffn_config"]["moe_num_experts"]
+            n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
+            n_embd = self.hparams["d_model"]
+
+            # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
+            # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
+            # But llama.cpp moe graph works differently
+            # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
+            # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
+            exp_tensor_names = {"ffn.experts.mlp.w1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff,   n_expert}
+                                "ffn.experts.mlp.w2": (0, 2, 1),  # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff,   n_embd, n_expert}
+                                "ffn.experts.mlp.v1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff,   n_expert}
+            experts = False
+            for exp_tensor_name in exp_tensor_names.keys():
+                if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
+                    experts = True
+                    data_torch = data_torch.view(n_expert, n_ff, n_embd)
+                    if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
+                        data_torch = data_torch.permute(*permute_tensor)
+                    break
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            # In MoE models the ffn tensors are typically most of the model weights,
+            # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
+            # Every other model has the weight names ending in .weight,
+            # let's assume that is the convention which is not the case for dbrx:
+            # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
+            new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # Most of the codebase that takes in 1D tensors only handles F32 tensors
+            # and most of the outputs tensors are F32.
+            if data_dtype != np.float32 and n_dims == 1:
+                print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
+                sys.exit()
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
 @Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
     model_arch = gguf.MODEL_ARCH.MINICPM
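The DBRX expert reshape above boils down to a view followed by an optional permute; a toy torch example with made-up sizes, for illustration only:

import torch

n_expert, n_ff, n_embd = 2, 3, 4
flat = torch.arange(n_expert * n_ff * n_embd, dtype=torch.float32)

w1 = flat.view(n_expert, n_ff, n_embd)                    # gate/up experts keep this layout
w2 = flat.view(n_expert, n_ff, n_embd).permute(0, 2, 1)   # down experts swap the last two dims
print(w1.shape, w2.shape)                                  # (2, 3, 4) (2, 4, 3)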
@@ -1592,6 +1782,105 @@ class Qwen2Model(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2
 
 
+@Model.register("Qwen2MoeForCausalLM")
+class Qwen2MoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2MOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_experts = self.hparams.get("num_experts")
+        experts = dict()
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # process the experts separately
+            if name.find("experts") != -1:
+                experts[name] = data
+                if len(experts) >= n_experts * 3:
+                    # merge the experts into a single 3d tensor
+                    for bid in range(block_count):
+                        for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                            full = True
+                            for xid in range(n_experts):
+                                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                                if ename not in experts:
+                                    full = False
+                                    break
+                            if not full:
+                                continue
+
+                            datas = []
+                            for xid in range(n_experts):
+                                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                                datas.append(experts[ename])
+                                del experts[ename]
+
+                            data = np.stack(datas, axis=0)
+                            data_dtype = data.dtype
+
+                            if self.ftype == 0 and data_dtype == np.float16:
+                                data = data.astype(np.float32)
+
+                            if self.ftype == 1 and data_dtype == np.float32:
+                                data = data.astype(np.float16)
+
+                            merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+                            if new_name is None:
+                                print(f"Can not map tensor {name!r}")
+                                sys.exit()
+
+                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+                            self.gguf_writer.add_tensor(new_name, data)
+                continue
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+        if len(experts) > 0:
+            raise ValueError(f"Unprocessed experts: {experts.keys()}")
+
+
 @Model.register("GPT2LMHeadModel")
 class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2
@@ -2043,34 +2332,25 @@ class BertModel(Model):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        # use huggingface vocab to get all tokens
-        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
-        tokens, scores, toktypes = zip(*vocab.all_tokens())
-        assert len(tokens) == vocab.vocab_size
-        self.vocab_size = vocab.vocab_size
+        tokens, toktypes = self.get_basic_vocab()
+        self.vocab_size = len(tokens)
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        n_token_types = len(set(toktypes))
-        self.gguf_writer.add_token_type_count(n_token_types)
+        self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
 
         # convert to phantom space vocab
-        def phantom(tok, typ):
-            if tok.startswith(b"[") and tok.endswith(b"]"):
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
                 return tok
-            if tok.startswith(b"##"):
+            if tok.startswith("##"):
                 return tok[2:]
-            return b"\xe2\x96\x81" + tok
-        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
-        # set up bos and eos tokens (cls and sep)
-        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
-        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+            return "\u2581" + tok
+        tokens = list(map(phantom, tokens))
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
         self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
 
         # handle special tokens
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
||||||
|
|
||||||
def get_tensors(self):
|
|
||||||
assert self.vocab_size is not None
|
|
||||||
for name, data in super().get_tensors():
|
|
||||||
# Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
|
|
||||||
if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
|
|
||||||
rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
|
|
||||||
assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
|
|
||||||
data = data[:self.vocab_size, :]
|
|
||||||
yield name, data
|
|
||||||
|
|
||||||
|
|
||||||
@Model.register("GemmaForCausalLM")
|
@Model.register("GemmaForCausalLM")
|
||||||
class GemmaModel(Model):
|
class GemmaModel(Model):
|
||||||
|
@ -2159,6 +2429,13 @@ class GemmaModel(Model):
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_sentencepiece()
|
self._set_vocab_sentencepiece()
|
||||||
|
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
|
||||||
|
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
|
||||||
|
special_vocab._set_special_token("prefix", 67)
|
||||||
|
special_vocab._set_special_token("suffix", 69)
|
||||||
|
special_vocab._set_special_token("middle", 68)
|
||||||
|
special_vocab._set_special_token("eot", 70)
|
||||||
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
|
@ -2181,6 +2458,12 @@ class GemmaModel(Model):
|
||||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
|
||||||
for name, data_torch in self.get_tensors():
|
for name, data_torch in self.get_tensors():
|
||||||
|
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
||||||
|
# To prevent errors, skip loading lm_head.weight.
|
||||||
|
if name == "lm_head.weight":
|
||||||
|
print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||||
|
continue
|
||||||
|
|
||||||
old_dtype = data_torch.dtype
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
# convert any unsupported data types to float32
|
# convert any unsupported data types to float32
|
||||||
|
@ -2327,7 +2610,8 @@ class MambaModel(Model):
|
||||||
data = data.astype(np.float32)
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
# if f16 desired, convert big float32 2-dim weight tensors to float16
|
# if f16 desired, convert big float32 2-dim weight tensors to float16
|
||||||
if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
|
new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
|
||||||
data = data.astype(np.float16)
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
@ -2378,6 +2662,7 @@ def parse_args() -> argparse.Namespace:
|
||||||
"model", type=Path,
|
"model", type=Path,
|
||||||
help="directory containing model file",
|
help="directory containing model file",
|
||||||
)
|
)
|
||||||
|
parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
@ -2421,7 +2706,7 @@ def main() -> None:
|
||||||
|
|
||||||
with torch.inference_mode():
|
with torch.inference_mode():
|
||||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
|
||||||
|
|
||||||
print("Set model parameters")
|
print("Set model parameters")
|
||||||
model_instance.set_gguf_parameters()
|
model_instance.set_gguf_parameters()
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
26
convert.py
26
convert.py
|
@ -33,7 +33,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
import gguf
|
import gguf
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from typing import TypeAlias
|
from typing_extensions import Self, TypeAlias
|
||||||
|
|
||||||
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
||||||
faulthandler.register(signal.SIGUSR1)
|
faulthandler.register(signal.SIGUSR1)
|
||||||
|
@ -139,7 +139,8 @@ class GGMLFileType(enum.IntEnum):
|
||||||
dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
|
dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
|
||||||
if dt is None:
|
if dt is None:
|
||||||
raise ValueError(self)
|
raise ValueError(self)
|
||||||
# 1D tensors are always F32.
|
# Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the output tensors are F32.
|
||||||
|
# Also, the 1D tensors aren't much of a performance/size issue, so instead of having to maintain separate F32 and F16 implementations of them, just convert everything to F32 for now.
|
||||||
return dt if len(tensor.shape) > 1 else DT_F32
|
return dt if len(tensor.shape) > 1 else DT_F32
|
||||||
|
|
||||||
|
|
||||||
|
@ -516,7 +517,7 @@ class LlamaHfVocab(Vocab):
|
||||||
tokenizer_model = "llama"
|
tokenizer_model = "llama"
|
||||||
name = "hfft"
|
name = "hfft"
|
||||||
|
|
||||||
def __init__(self, base_path: Path, ignore_nonllama: bool = False):
|
def __init__(self, base_path: Path):
|
||||||
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
|
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
|
||||||
# if this fails, FileNotFoundError propagates to caller
|
# if this fails, FileNotFoundError propagates to caller
|
||||||
with open(fname_tokenizer, encoding='utf-8') as f:
|
with open(fname_tokenizer, encoding='utf-8') as f:
|
||||||
|
@ -524,9 +525,7 @@ class LlamaHfVocab(Vocab):
|
||||||
|
|
||||||
# pre-check so we know if we need transformers
|
# pre-check so we know if we need transformers
|
||||||
tokenizer_model: dict[str, Any] = tokenizer_json['model']
|
tokenizer_model: dict[str, Any] = tokenizer_json['model']
|
||||||
if ignore_nonllama:
|
if (
|
||||||
pass # workaround incorrect use of this class for WordPiece
|
|
||||||
elif (
|
|
||||||
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
|
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
|
||||||
or tokenizer_json['decoder']['type'] != 'Sequence'
|
or tokenizer_json['decoder']['type'] != 'Sequence'
|
||||||
):
|
):
|
||||||
|
@ -646,16 +645,17 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
|
||||||
|
|
||||||
|
|
||||||
class Tensor(ABC):
|
class Tensor(ABC):
|
||||||
|
ndarray: NDArray
|
||||||
data_type: DataType
|
data_type: DataType
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def astype(self, data_type: DataType) -> Tensor: ...
|
def astype(self, data_type: DataType) -> Self: ...
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
|
def permute(self, n_head: int, n_head_kv: int) -> Self: ...
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
|
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def part(self, n_part: int) -> UnquantizedTensor: ...
|
def part(self, n_part: int) -> Self: ...
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def to_ggml(self) -> GGMLCompatibleTensor: ...
|
def to_ggml(self) -> GGMLCompatibleTensor: ...
|
||||||
|
|
||||||
|
@ -672,13 +672,13 @@ class UnquantizedTensor(Tensor):
|
||||||
self.ndarray = ndarray
|
self.ndarray = ndarray
|
||||||
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
||||||
|
|
||||||
def astype(self, data_type: DataType) -> Tensor:
|
def astype(self, data_type: DataType) -> UnquantizedTensor:
|
||||||
dtype = data_type.dtype
|
dtype = data_type.dtype
|
||||||
if self.data_type == DT_BF16:
|
if self.data_type == DT_BF16:
|
||||||
self.ndarray = bf16_to_fp32(self.ndarray)
|
self.ndarray = bf16_to_fp32(self.ndarray)
|
||||||
return UnquantizedTensor(self.ndarray.astype(dtype))
|
return UnquantizedTensor(self.ndarray.astype(dtype))
|
||||||
|
|
||||||
def to_ggml(self) -> UnquantizedTensor:
|
def to_ggml(self) -> Self:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
|
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
|
||||||
|
@ -1350,7 +1350,7 @@ def load_some_model(path: Path) -> ModelPlus:
|
||||||
# Be extra-friendly and accept either a file or a directory:
|
# Be extra-friendly and accept either a file or a directory:
|
||||||
if path.is_dir():
|
if path.is_dir():
|
||||||
# Check if it's a set of safetensors files first
|
# Check if it's a set of safetensors files first
|
||||||
globs = ["model-00001-of-*.safetensors", "model.safetensors"]
|
globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
|
||||||
files = [file for glob in globs for file in path.glob(glob)]
|
files = [file for glob in globs for file in path.glob(glob)]
|
||||||
if not files:
|
if not files:
|
||||||
# Try the PyTorch patterns too, with lower priority
|
# Try the PyTorch patterns too, with lower priority
|
||||||
|
|
119
docs/HOWTO-add-model.md
Normal file
119
docs/HOWTO-add-model.md
Normal file
|
@ -0,0 +1,119 @@
|
||||||
|
## Add a new model architecture to `llama.cpp`
|
||||||
|
|
||||||
|
Adding a model requires a few steps:
|
||||||
|
|
||||||
|
1. Convert the model to GGUF
|
||||||
|
2. Define the model architecture in `llama.cpp`
|
||||||
|
3. Build the GGML graph implementation
|
||||||
|
|
||||||
|
After following these steps, you can open a PR.
|
||||||
|
|
||||||
|
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
|
||||||
|
- [main](../examples/main)
|
||||||
|
- [imatrix](../examples/imatrix)
|
||||||
|
- [quantize](../examples/quantize)
|
||||||
|
- [server](../examples/server)
|
||||||
|
|
||||||
|
### 1. Convert the model to GGUF
|
||||||
|
|
||||||
|
This step is done in Python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
|
||||||
|
Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
|
||||||
|
|
||||||
|
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
|
||||||
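Under the hood this boils down to the `gguf.GGUFWriter` API. Below is a minimal, hypothetical sketch of what a converter produces; the metadata values and the dummy tensor are placeholders, whereas a real converter reads them from the model's config and checkpoint files:

```python
import numpy as np
import gguf

# metadata (key/value pairs) is written first, then the tensor data
writer = gguf.GGUFWriter("model-f32.gguf", arch="llama")
writer.add_block_count(2)
writer.add_context_length(2048)
writer.add_embedding_length(8)

# tensors are added as numpy arrays under their GGUF names
writer.add_tensor("token_embd.weight", np.zeros((16, 8), dtype=np.float32))

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```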
|
|
||||||
|
The required steps to implement for an HF model are:
|
||||||
|
|
||||||
|
1. Define the model `Model.register` annotation in a new `Model` subclass, example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@Model.register("MyModelForCausalLM")
|
||||||
|
class MyModel(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.GROK
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
|
||||||
|
|
||||||
|
Add an enum entry in `MODEL_ARCH`, the model's human-friendly name in `MODEL_ARCH_NAMES`, and the GGUF tensor names in `MODEL_TENSORS`.
|
||||||
|
|
||||||
|
Example for `falcon` model:
|
||||||
|
```python
|
||||||
|
MODEL_ARCH.FALCON: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_NORM_2,
|
||||||
|
MODEL_TENSOR.ATTN_QKV,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Map the original tensor names to the standardized equivalents in GGUF
|
||||||
|
|
||||||
|
As a general rule, before adding a new tensor name to GGUF, be sure an equivalent name does not already exist.
|
||||||
|
|
||||||
|
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
|
||||||
|
|
||||||
|
If the tensor name is part of a repeating layer/block, the placeholder `bid` substitutes the block index.
|
||||||
|
|
||||||
|
Example for the normalization tensor in attention layers:
|
||||||
|
|
||||||
|
```python
|
||||||
|
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||||
|
# Attention norm
|
||||||
|
MODEL_TENSOR.ATTN_NORM: (
|
||||||
|
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||||
|
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen
|
||||||
|
"transformer.blocks.{bid}.norm_1", # mpt
|
||||||
|
...
|
||||||
|
)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
|
||||||
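For example, the mapping used by the convert scripts can be exercised like this; the architecture and block count here are placeholder values chosen for illustration:

```python
import gguf

# name map for a hypothetical 32-block MPT-style model
tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MPT, 32)

# resolve an original tensor name to its GGUF equivalent
name = tensor_map.get_name("transformer.blocks.7.norm_1.weight", try_suffixes=(".weight", ".bias"))
print(name)  # expected: blk.7.attn_norm.weight
```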
|
|
||||||
|
Depending on the model configuration, tokenizer, code and tensor layout, you will have to override (a minimal sketch follows this list):
|
||||||
|
- `Model#set_gguf_parameters`
|
||||||
|
- `Model#set_vocab`
|
||||||
|
- `Model#write_tensors`
|
||||||
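A minimal sketch of such an override, extending the `MyModel` class from step 1; the hyperparameter keys below are placeholders and must match the model's actual `config.json`:

```python
@Model.register("MyModelForCausalLM")
class MyModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA  # placeholder architecture

    def set_gguf_parameters(self):
        hparams = self.hparams  # parsed from config.json by the base class
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(hparams["num_hidden_layers"])
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
```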
|
|
||||||
|
NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` expect this suffix on the weight tensors.
|
||||||
|
|
||||||
|
### 2. Define the model architecture in `llama.cpp`
|
||||||
|
|
||||||
|
The model parameters and tensor layout must be defined in `llama.cpp`:
|
||||||
|
1. Define a new `llm_arch`
|
||||||
|
2. Define the tensor layout in `LLM_TENSOR_NAMES`
|
||||||
|
3. Add any non-standard metadata in `llm_load_hparams`
|
||||||
|
4. Create the tensors for inference in `llm_load_tensors`
|
||||||
|
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
|
||||||
|
|
||||||
|
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
|
||||||
|
|
||||||
|
### 3. Build the GGML graph implementation
|
||||||
|
|
||||||
|
This is the most interesting part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
|
||||||
|
|
||||||
|
Have a look at existing implementations such as `build_llama`, `build_dbrx` or `build_bert`.
|
||||||
|
|
||||||
|
When implementing a new graph, please note that the underlying `ggml` backends might not support all of its operations; support for missing backend operations can be added in another PR.
|
||||||
|
|
||||||
|
Note: to debug the inference graph, you can use [eval-callback](../examples/eval-callback).
|
||||||
|
|
||||||
|
## GGUF specification
|
||||||
|
|
||||||
|
https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
|
||||||
|
|
||||||
|
## Resources
|
||||||
|
|
||||||
|
- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
|
||||||
|
- support Baichuan series models https://github.com/ggerganov/llama.cpp/pull/3009
|
||||||
|
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
|
||||||
|
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
|
||||||
|
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
|
||||||
|
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
|
||||||
|
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
|
||||||
|
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
|
||||||
|
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
|
|
@ -19,6 +19,7 @@ else()
|
||||||
add_subdirectory(benchmark)
|
add_subdirectory(benchmark)
|
||||||
add_subdirectory(convert-llama2c-to-ggml)
|
add_subdirectory(convert-llama2c-to-ggml)
|
||||||
add_subdirectory(embedding)
|
add_subdirectory(embedding)
|
||||||
|
add_subdirectory(eval-callback)
|
||||||
add_subdirectory(finetune)
|
add_subdirectory(finetune)
|
||||||
add_subdirectory(gritlm)
|
add_subdirectory(gritlm)
|
||||||
add_subdirectory(gguf-split)
|
add_subdirectory(gguf-split)
|
||||||
|
|
|
@ -10,16 +10,16 @@ There are 2 modes of operation:
|
||||||
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
|
./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
|
||||||
|
|
||||||
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||||
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
|
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
|
||||||
|
|
||||||
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
|
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
|
||||||
|
|
||||||
# custom set of batches
|
# custom set of batches
|
||||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
|
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
|
||||||
```
|
```
|
||||||
|
|
||||||
## Sample results
|
## Sample results
|
||||||
|
|
|
@ -32,13 +32,15 @@ int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (argc == 1 || argv[1][0] == '-') {
|
if (argc == 1 || argv[1][0] == '-') {
|
||||||
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
|
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
|
||||||
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
|
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
|
||||||
printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
|
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
|
||||||
return 1 ;
|
return 1 ;
|
||||||
}
|
}
|
||||||
|
|
||||||
int n_kv_max = 2048;
|
int n_kv_max = 2048;
|
||||||
|
int n_batch = 2048;
|
||||||
|
int n_ubatch = 512;
|
||||||
int is_pp_shared = 0;
|
int is_pp_shared = 0;
|
||||||
int n_gpu_layers = 0;
|
int n_gpu_layers = 0;
|
||||||
|
|
||||||
|
@ -56,23 +58,31 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argc >= 4) {
|
if (argc >= 4) {
|
||||||
is_pp_shared = std::atoi(argv[3]);
|
n_batch = std::atoi(argv[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argc >= 5) {
|
if (argc >= 5) {
|
||||||
n_gpu_layers = std::atoi(argv[4]);
|
n_ubatch = std::atoi(argv[4]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argc >= 6) {
|
if (argc >= 6) {
|
||||||
n_pp = parse_list(argv[5]);
|
is_pp_shared = std::atoi(argv[5]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argc >= 7) {
|
if (argc >= 7) {
|
||||||
n_tg = parse_list(argv[6]);
|
n_gpu_layers = std::atoi(argv[6]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argc >= 8) {
|
if (argc >= 8) {
|
||||||
n_pl = parse_list(argv[7]);
|
n_pp = parse_list(argv[7]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 9) {
|
||||||
|
n_tg = parse_list(argv[8]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 10) {
|
||||||
|
n_pl = parse_list(argv[9]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// init LLM
|
// init LLM
|
||||||
|
@ -100,7 +110,8 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
ctx_params.seed = 1234;
|
ctx_params.seed = 1234;
|
||||||
ctx_params.n_ctx = n_kv_max;
|
ctx_params.n_ctx = n_kv_max;
|
||||||
ctx_params.n_batch = 2048;
|
ctx_params.n_batch = n_batch;
|
||||||
|
ctx_params.n_ubatch = n_ubatch;
|
||||||
|
|
||||||
ctx_params.n_threads = params.n_threads;
|
ctx_params.n_threads = params.n_threads;
|
||||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||||
|
@ -158,7 +169,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||||
|
|
|
@ -123,10 +123,10 @@ int main(int argc, char ** argv) {
|
||||||
inputs.push_back(inp);
|
inputs.push_back(inp);
|
||||||
}
|
}
|
||||||
|
|
||||||
// add eos if not present
|
// add SEP if not present
|
||||||
for (auto & inp : inputs) {
|
for (auto & inp : inputs) {
|
||||||
if (inp.empty() || inp.back() != llama_token_eos(model)) {
|
if (inp.empty() || inp.back() != llama_token_sep(model)) {
|
||||||
inp.push_back(llama_token_eos(model));
|
inp.push_back(llama_token_sep(model));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
9
examples/eval-callback/CMakeLists.txt
Normal file
9
examples/eval-callback/CMakeLists.txt
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
set(TARGET eval-callback)
|
||||||
|
add_executable(${TARGET} eval-callback.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
||||||
|
set(TEST_TARGET test-eval-callback)
|
||||||
|
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||||
|
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
95
examples/eval-callback/README.md
Normal file
95
examples/eval-callback/README.md
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
# llama.cpp/examples/eval-callback
|
||||||
|
|
||||||
|
A simple example which demonstrates how to use a callback during inference.
|
||||||
|
It simply prints to the console all operations and tensor data.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
eval-callback \
|
||||||
|
--hf-repo ggml-org/models \
|
||||||
|
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||||
|
--model phi-2-q4_0.gguf \
|
||||||
|
--prompt hello \
|
||||||
|
--seed 42 \
|
||||||
|
-ngl 33
|
||||||
|
```
|
||||||
|
|
||||||
|
Will print:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
llm_load_tensors: offloaded 33/33 layers to GPU
|
||||||
|
...
|
||||||
|
llama_new_context_with_model: n_ctx = 512
|
||||||
|
...
|
||||||
|
llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
|
||||||
|
llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
|
||||||
|
llama_new_context_with_model: graph nodes = 1225
|
||||||
|
llama_new_context_with_model: graph splits = 2
|
||||||
|
ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ -0.0181, 0.0272, 0.0272, ...],
|
||||||
|
],
|
||||||
|
]
|
||||||
|
ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ -0.6989, 1.0636, 1.0636, ...],
|
||||||
|
],
|
||||||
|
]
|
||||||
|
ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ -0.1800, 0.2817, 0.2632, ...],
|
||||||
|
],
|
||||||
|
]
|
||||||
|
ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ -0.1863, 0.2970, 0.2604, ...],
|
||||||
|
],
|
||||||
|
]
|
||||||
|
ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ -1.1238, 1.2876, -1.8086, ...],
|
||||||
|
],
|
||||||
|
]
|
||||||
|
ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ -1.1135, 1.4604, -1.9226, ...],
|
||||||
|
],
|
||||||
|
]
|
||||||
|
ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ -1.1135, 1.4604, -1.9226, ...],
|
||||||
|
],
|
||||||
|
]
|
||||||
|
ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ -1.1135, 1.4604, -1.9226, ...],
|
||||||
|
],
|
||||||
|
]
|
||||||
|
ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ -1.1135, 1.4604, -1.9226, ...],
|
||||||
|
[ -0.3608, 0.5076, -1.8866, ...],
|
||||||
|
[ 1.7643, 0.0273, -2.1065, ...],
|
||||||
|
...
|
||||||
|
],
|
||||||
|
]
|
||||||
|
ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ -1.1135, 1.4604, -1.9226, ...],
|
||||||
|
[ -0.3608, 0.5076, -1.8866, ...],
|
||||||
|
[ 1.7643, 0.0273, -2.1065, ...],
|
||||||
|
...
|
||||||
|
],
|
||||||
|
]
|
||||||
|
```
|
195
examples/eval-callback/eval-callback.cpp
Normal file
195
examples/eval-callback/eval-callback.cpp
Normal file
|
@ -0,0 +1,195 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <random>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is the arbitrary data which will be passed to each callback.
|
||||||
|
* Later on we can, for example, add an operation or tensor name filter from a CLI arg, or a file descriptor to dump the tensor.
|
||||||
|
*/
|
||||||
|
struct callback_data {
|
||||||
|
std::vector<uint8_t> data;
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||||
|
std::string str;
|
||||||
|
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||||
|
str += std::to_string(t->ne[i]);
|
||||||
|
if (i + 1 < GGML_MAX_DIMS) {
|
||||||
|
str += ", ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||||
|
GGML_ASSERT(n > 0);
|
||||||
|
float sum = 0;
|
||||||
|
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||||
|
printf(" [\n");
|
||||||
|
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||||
|
if (i2 == n && ne[2] > 2*n) {
|
||||||
|
printf(" ..., \n");
|
||||||
|
i2 = ne[2] - n;
|
||||||
|
}
|
||||||
|
printf(" [\n");
|
||||||
|
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||||
|
if (i1 == n && ne[1] > 2*n) {
|
||||||
|
printf(" ..., \n");
|
||||||
|
i1 = ne[1] - n;
|
||||||
|
}
|
||||||
|
printf(" [");
|
||||||
|
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||||
|
if (i0 == n && ne[0] > 2*n) {
|
||||||
|
printf("..., ");
|
||||||
|
i0 = ne[0] - n;
|
||||||
|
}
|
||||||
|
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||||
|
float v;
|
||||||
|
if (type == GGML_TYPE_F16) {
|
||||||
|
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); // i is a byte offset into the raw tensor data
|
||||||
|
} else if (type == GGML_TYPE_F32) {
|
||||||
|
v = *(float *) &data[i];
|
||||||
|
} else if (type == GGML_TYPE_I32) {
|
||||||
|
v = (float) *(int32_t *) &data[i];
|
||||||
|
} else if (type == GGML_TYPE_I16) {
|
||||||
|
v = (float) *(int16_t *) &data[i];
|
||||||
|
} else if (type == GGML_TYPE_I8) {
|
||||||
|
v = (float) *(int8_t *) &data[i];
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
printf("%12.4f", v);
|
||||||
|
sum += v;
|
||||||
|
if (i0 < ne[0] - 1) printf(", ");
|
||||||
|
}
|
||||||
|
printf("],\n");
|
||||||
|
}
|
||||||
|
printf(" ],\n");
|
||||||
|
}
|
||||||
|
printf(" ]\n");
|
||||||
|
printf(" sum = %f\n", sum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GGML operations callback during the graph execution.
|
||||||
|
*
|
||||||
|
* @param t current tensor
|
||||||
|
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||||
|
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
|
||||||
|
* see ggml_backend_sched_eval_callback
|
||||||
|
* @param user_data user data to pass at each call back
|
||||||
|
* @return true to receive data or continue the graph, false otherwise
|
||||||
|
*/
|
||||||
|
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
|
auto * cb_data = (callback_data *) user_data;
|
||||||
|
|
||||||
|
const struct ggml_tensor * src0 = t->src[0];
|
||||||
|
const struct ggml_tensor * src1 = t->src[1];
|
||||||
|
|
||||||
|
if (ask) {
|
||||||
|
return true; // Always retrieve data
|
||||||
|
}
|
||||||
|
|
||||||
|
char src1_str[128] = {0};
|
||||||
|
if (src1) {
|
||||||
|
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||||
|
t->name, ggml_type_name(t->type), ggml_op_desc(t),
|
||||||
|
src0->name, ggml_ne_string(src0).c_str(),
|
||||||
|
src1 ? src1_str : "",
|
||||||
|
ggml_ne_string(t).c_str());
|
||||||
|
|
||||||
|
|
||||||
|
// copy the data from the GPU memory if needed
|
||||||
|
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
||||||
|
|
||||||
|
if (!is_host) {
|
||||||
|
auto n_bytes = ggml_nbytes(t);
|
||||||
|
cb_data->data.resize(n_bytes);
|
||||||
|
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ggml_is_quantized(t->type)) {
|
||||||
|
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
||||||
|
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool run(llama_context * ctx, const gpt_params & params) {
|
||||||
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
|
||||||
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
|
|
||||||
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
|
||||||
|
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
|
||||||
|
callback_data cb_data;
|
||||||
|
|
||||||
|
gpt_params params;
|
||||||
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
print_build_info();
|
||||||
|
|
||||||
|
std::mt19937 rng(params.seed);
|
||||||
|
if (params.random_prompt) {
|
||||||
|
params.prompt = gpt_random_prompt(rng);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_backend_init();
|
||||||
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
|
// pass the callback to the backend scheduler
|
||||||
|
// it will be executed for each node during the graph computation
|
||||||
|
params.cb_eval = ggml_debug;
|
||||||
|
params.cb_eval_user_data = &cb_data;
|
||||||
|
params.warmup = false;
|
||||||
|
|
||||||
|
// init
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
if (model == nullptr || ctx == nullptr) {
|
||||||
|
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// print system information
|
||||||
|
{
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
bool OK = run(ctx, params);
|
||||||
|
if (!OK) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_print_timings(ctx);
|
||||||
|
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
|
||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||||
auto prev_stacks = grammar->stacks;
|
auto prev_stacks = grammar->stacks;
|
||||||
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
|
||||||
if (grammar->stacks.empty()) {
|
if (grammar->stacks.empty()) {
|
||||||
error_pos = pos;
|
error_pos = pos;
|
||||||
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
|
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
|
||||||
|
|
|
@ -5,5 +5,6 @@ CLI to split / merge GGUF files.
|
||||||
**Command line options:**
|
**Command line options:**
|
||||||
|
|
||||||
- `--split`: split GGUF to multiple GGUF, default operation.
|
- `--split`: split GGUF to multiple GGUF, default operation.
|
||||||
|
- `--split-max-size`: max size per split in `M` or `G`, e.g. `500M` or `2G`.
|
||||||
- `--split-max-tensors`: maximum number of tensors in each split (default: 128)
|
- `--split-max-tensors`: maximum number of tensors in each split (default: 128)
|
||||||
- `--merge`: merge multiple GGUF to a single GGUF.
|
- `--merge`: merge multiple GGUF to a single GGUF.
|
||||||
|
|
|
@ -59,10 +59,10 @@ static size_t split_str_to_n_bytes(std::string str) {
|
||||||
int n;
|
int n;
|
||||||
if (str.back() == 'M') {
|
if (str.back() == 'M') {
|
||||||
sscanf(str.c_str(), "%d", &n);
|
sscanf(str.c_str(), "%d", &n);
|
||||||
n_bytes = n * 1024 * 1024; // megabytes
|
n_bytes = (size_t)n * 1024 * 1024; // megabytes
|
||||||
} else if (str.back() == 'G') {
|
} else if (str.back() == 'G') {
|
||||||
sscanf(str.c_str(), "%d", &n);
|
sscanf(str.c_str(), "%d", &n);
|
||||||
n_bytes = n * 1024 * 1024 * 1024; // gigabytes
|
n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
|
||||||
} else {
|
} else {
|
||||||
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
|
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
|
||||||
}
|
}
|
||||||
|
|
89
examples/gguf-split/tests.sh
Normal file
89
examples/gguf-split/tests.sh
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
if [ $# -lt 1 ]
|
||||||
|
then
|
||||||
|
echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
|
||||||
|
echo "example: $0 ../../build/bin ../../tmp"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $# -gt 1 ]
|
||||||
|
then
|
||||||
|
TMP_DIR=$2
|
||||||
|
else
|
||||||
|
TMP_DIR=/tmp
|
||||||
|
fi
|
||||||
|
|
||||||
|
set -x
|
||||||
|
|
||||||
|
SPLIT=$1/gguf-split
|
||||||
|
MAIN=$1/main
|
||||||
|
WORK_PATH=$TMP_DIR/gguf-split
|
||||||
|
CUR_DIR=$(pwd)
|
||||||
|
|
||||||
|
mkdir -p "$WORK_PATH"
|
||||||
|
|
||||||
|
# Clean up in case of previously failed test
|
||||||
|
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
|
||||||
|
|
||||||
|
# 1. Get a model
|
||||||
|
(
|
||||||
|
cd $WORK_PATH
|
||||||
|
"$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
|
||||||
|
)
|
||||||
|
echo PASS
|
||||||
|
|
||||||
|
# 2. Split with max tensors strategy
|
||||||
|
$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
|
||||||
|
echo PASS
|
||||||
|
echo
|
||||||
|
|
||||||
|
# 2b. Test the sharded model is loading properly
|
||||||
|
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
|
||||||
|
echo PASS
|
||||||
|
echo
|
||||||
|
|
||||||
|
# 3. Merge
|
||||||
|
$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
|
||||||
|
echo PASS
|
||||||
|
echo
|
||||||
|
|
||||||
|
# 3b. Test the merged model is loading properly
|
||||||
|
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
|
||||||
|
echo PASS
|
||||||
|
echo
|
||||||
|
|
||||||
|
# 4. Split with no tensor in metadata
|
||||||
|
#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
|
||||||
|
#echo PASS
|
||||||
|
#echo
|
||||||
|
|
||||||
|
# 4b. Test the sharded model is loading properly
|
||||||
|
#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
|
||||||
|
#echo PASS
|
||||||
|
#echo
|
||||||
|
|
||||||
|
# 5. Merge
|
||||||
|
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
|
||||||
|
#echo PASS
|
||||||
|
#echo
|
||||||
|
|
||||||
|
# 5b. Test the merged model is loading properly
|
||||||
|
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
|
||||||
|
#echo PASS
|
||||||
|
#echo
|
||||||
|
|
||||||
|
# 6. Split with size strategy
|
||||||
|
$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
|
||||||
|
echo PASS
|
||||||
|
echo
|
||||||
|
|
||||||
|
# 6b. Test the sharded model is loading properly
|
||||||
|
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
|
||||||
|
echo PASS
|
||||||
|
echo
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
|
|
@ -142,7 +142,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// read and create ggml_context containing the tensors and their data
|
// read and create ggml_context containing the tensors and their data
|
||||||
static bool gguf_ex_read_1(const std::string & fname) {
|
static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
|
||||||
struct ggml_context * ctx_data = NULL;
|
struct ggml_context * ctx_data = NULL;
|
||||||
|
|
||||||
struct gguf_init_params params = {
|
struct gguf_init_params params = {
|
||||||
|
@ -206,7 +206,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
||||||
printf("\n\n");
|
printf("\n\n");
|
||||||
|
|
||||||
// check data
|
// check data
|
||||||
{
|
if (check_data) {
|
||||||
const float * data = (const float *) cur->data;
|
const float * data = (const float *) cur->data;
|
||||||
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
||||||
if (data[j] != 100 + i) {
|
if (data[j] != 100 + i) {
|
||||||
|
@ -229,9 +229,16 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
if (argc < 3) {
|
if (argc < 3) {
|
||||||
printf("usage: %s data.gguf r|w\n", argv[0]);
|
printf("usage: %s data.gguf r|w [n]\n", argv[0]);
|
||||||
|
printf("r: read data.gguf file\n");
|
||||||
|
printf("w: write data.gguf file\n");
|
||||||
|
printf("n: no check of tensor data\n");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
bool check_data = true;
|
||||||
|
if (argc == 4) {
|
||||||
|
check_data = false;
|
||||||
|
}
|
||||||
|
|
||||||
const std::string fname(argv[1]);
|
const std::string fname(argv[1]);
|
||||||
const std::string mode (argv[2]);
|
const std::string mode (argv[2]);
|
||||||
|
@ -242,7 +249,7 @@ int main(int argc, char ** argv) {
|
||||||
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
|
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
|
||||||
} else if (mode == "r") {
|
} else if (mode == "r") {
|
||||||
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
|
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
|
||||||
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
|
GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -21,12 +21,12 @@ not have to be performed at all.
|
||||||
### Running the example
|
### Running the example
|
||||||
Download a Grit model:
|
Download a Grit model:
|
||||||
```console
|
```console
|
||||||
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
|
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
|
||||||
```
|
```
|
||||||
|
|
||||||
Run the example using the downloaded model:
|
Run the example using the downloaded model:
|
||||||
```console
|
```console
|
||||||
$ ./gritlm -m gritlm-7b_q4_1.gguf
|
$ ./gritlm -m models/gritlm-7b_q4_1.gguf
|
||||||
|
|
||||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
|
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
|
||||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
|
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
|
||||||
|
|
|
@ -107,9 +107,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||||
|
|
||||||
// the top-k selected expert ids are stored in the ids tensor
|
// the top-k selected expert ids are stored in the ids tensor
|
||||||
// for simplicity, always copy ids to host, because it is small
|
// for simplicity, always copy ids to host, because it is small
|
||||||
// take into account that ids is not contiguous!
|
|
||||||
GGML_ASSERT(ids->ne[1] == src1->ne[1]);
|
GGML_ASSERT(ids->ne[1] == src1->ne[1]);
|
||||||
GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
|
|
||||||
m_ids.resize(ggml_nbytes(ids)/sizeof(int));
|
m_ids.resize(ggml_nbytes(ids)/sizeof(int));
|
||||||
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
|
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
|
||||||
|
|
||||||
|
@ -349,12 +347,13 @@ static void process_logits(
|
||||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
|
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
@ -596,24 +595,18 @@ int main(int argc, char ** argv) {
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
|
||||||
if (model == NULL) {
|
|
||||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
|
||||||
|
|
||||||
// pass the callback to the backend scheduler
|
// pass the callback to the backend scheduler
|
||||||
// it will be executed for each node during the graph computation
|
// it will be executed for each node during the graph computation
|
||||||
cparams.cb_eval = ik_collect_imatrix;
|
params.cb_eval = ik_collect_imatrix;
|
||||||
cparams.cb_eval_user_data = NULL;
|
params.cb_eval_user_data = NULL;
|
||||||
|
params.warmup = false;
|
||||||
|
|
||||||
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
// init
|
||||||
if (ctx == NULL) {
|
llama_model * model;
|
||||||
fprintf(stderr, "%s: error: unable to create context\n", __func__);
|
llama_context * ctx;
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
if (model == nullptr || ctx == nullptr) {
|
||||||
|
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -36,6 +36,11 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi
|
||||||
|
|
||||||
### Example
|
### Example
|
||||||
|
|
||||||
|
Download a model that supports infill, for example CodeLlama:
|
||||||
|
```console
|
||||||
|
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
|
||||||
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
||||||
```
|
```
|
||||||
|
|
|
@ -239,6 +239,7 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
LOG_TEE("%s\n", get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||||
LOG("add_bos: %d\n", add_bos);
|
LOG("add_bos: %d\n", add_bos);
|
||||||
|
|
||||||
bool suff_rm_leading_spc = params.escape;
|
bool suff_rm_leading_spc = params.escape;
|
||||||
|
@ -279,10 +280,10 @@ int main(int argc, char ** argv) {
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
|
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||||
|
|
||||||
original_prompt_len = original_inp.size();
|
original_prompt_len = original_inp.size();
|
||||||
|
|
|
@ -6,37 +6,94 @@ import re
|
||||||
import sys
|
import sys
|
||||||
from typing import Any, Dict, List, Set, Tuple, Union
|
from typing import Any, Dict, List, Set, Tuple, Union
|
||||||
|
|
||||||
|
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
|
||||||
|
if not separator_rule:
|
||||||
|
if min_items == 0 and max_items == 1:
|
||||||
|
return f'{item_rule}?'
|
||||||
|
elif min_items == 1 and max_items is None:
|
||||||
|
return f'{item_rule}+'
|
||||||
|
|
||||||
|
result = ''
|
||||||
|
|
||||||
|
if min_items > 0:
|
||||||
|
if item_rule_is_literal and separator_rule is None:
|
||||||
|
result = '"' + (item_rule[1:-1] * min_items) + '"'
|
||||||
|
else:
|
||||||
|
result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
|
||||||
|
|
||||||
|
def opt_repetitions(up_to_n, prefix_with_sep=False):
|
||||||
|
'''
|
||||||
|
- n=4, no sep: '(a (a (a (a)?)?)?)?'
|
||||||
|
- n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
|
||||||
|
- n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
|
||||||
|
'''
|
||||||
|
|
||||||
|
content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
|
||||||
|
if up_to_n == 0:
|
||||||
|
return ''
|
||||||
|
elif up_to_n == 1:
|
||||||
|
return f'({content})?'
|
||||||
|
elif separator_rule and not prefix_with_sep:
|
||||||
|
return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
|
||||||
|
else:
|
||||||
|
return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
|
||||||
|
|
||||||
|
if min_items > 0 and max_items != min_items:
|
||||||
|
result += ' '
|
||||||
|
|
||||||
|
if max_items is not None:
|
||||||
|
result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
|
||||||
|
else:
|
||||||
|
item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
|
||||||
|
|
||||||
|
if min_items == 0 and separator_rule:
|
||||||
|
result = f'({item_rule} {item_operator}*)?'
|
||||||
|
else:
|
||||||
|
result += f'{item_operator}*'
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
class BuiltinRule:
|
||||||
|
def __init__(self, content: str, deps: list = None):
|
||||||
|
self.content = content
|
||||||
|
self.deps = deps or []
|
||||||
|
|
||||||
|
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
|
||||||
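# For example, with no separator rule _build_repetition('[0-9]', 0, 15) expands to a
# nested optional pattern of the form "([0-9] ([0-9] ... ([0-9])?)?)?", i.e. up to 15 digits.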
|
|
||||||
# whitespace is constrained to a single space char to prevent model "running away" in
|
# whitespace is constrained to a single space char to prevent model "running away" in
|
||||||
# whitespace. Also maybe improves generation quality?
|
# whitespace. Also maybe improves generation quality?
|
||||||
SPACE_RULE = '" "?'
|
SPACE_RULE = '" "?'
|
||||||
|
|
||||||
PRIMITIVE_RULES = {
|
PRIMITIVE_RULES = {
|
||||||
'boolean': '("true" | "false") space',
|
'boolean' : BuiltinRule('("true" | "false") space', []),
|
||||||
'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
|
'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
|
||||||
'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
|
'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
|
||||||
'value' : 'object | array | string | number | boolean',
|
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
|
||||||
'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
|
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
|
||||||
'array' : '"[" space ( value ("," space value)* )? "]" space',
|
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
|
||||||
'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
|
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
|
||||||
'string': r''' "\"" (
|
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
|
||||||
[^"\\] |
|
'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
|
||||||
)* "\"" space''',
|
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
|
||||||
'null': '"null" space',
|
'null' : BuiltinRule('"null" space', []),
|
||||||
}
|
}
|
||||||
OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']
|
|
||||||
|
|
||||||
# TODO: support "uri", "email" string formats
|
# TODO: support "uri", "email" string formats
|
||||||
DATE_RULES = {
|
STRING_FORMAT_RULES = {
|
||||||
'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
|
'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
|
||||||
'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
|
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
|
||||||
'date-time': 'date "T" time',
|
'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
|
||||||
'date-string': '"\\"" date "\\"" space',
|
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
|
||||||
'time-string': '"\\"" time "\\"" space',
|
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
|
||||||
'date-time-string': '"\\"" date-time "\\"" space',
|
'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
|
||||||
}
|
}
|
||||||
|
|
||||||
RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])
|
DOTALL = '[\\U00000000-\\U0010FFFF]'
|
||||||
|
DOT = '[^\\x0A\\x0D]'
|
||||||
|
|
||||||
|
RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
|
||||||
|
|
||||||
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
|
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
|
||||||
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
|
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
|
||||||
|
@ -46,8 +103,6 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']'
|
||||||
NON_LITERAL_SET = set('|.()[]{}*+?')
|
NON_LITERAL_SET = set('|.()[]{}*+?')
|
||||||
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
|
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
|
||||||
|
|
||||||
DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
|
|
||||||
TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
|
|
||||||
|
|
||||||
class SchemaConverter:
|
class SchemaConverter:
|
||||||
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
|
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
|
||||||
|
@ -55,7 +110,9 @@ class SchemaConverter:
|
||||||
self._allow_fetch = allow_fetch
|
self._allow_fetch = allow_fetch
|
||||||
self._dotall = dotall
|
self._dotall = dotall
|
||||||
self._raw_pattern = raw_pattern
|
self._raw_pattern = raw_pattern
|
||||||
self._rules = {'space': SPACE_RULE}
|
self._rules = {
|
||||||
|
'space': SPACE_RULE,
|
||||||
|
}
|
||||||
self._refs = {}
|
self._refs = {}
|
||||||
self._refs_being_resolved = set()
|
self._refs_being_resolved = set()
|
||||||
|
|
||||||
|
@ -65,6 +122,29 @@ class SchemaConverter:
|
||||||
)
|
)
|
||||||
return f'"{escaped}"'
|
return f'"{escaped}"'
|
||||||
|
|
||||||
|
def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
|
||||||
|
'''
|
||||||
|
not_literal('a') -> '[^a]'
|
||||||
|
not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
|
||||||
|
'''
|
||||||
|
assert len(literal) > 0, 'Empty literal not supported'
|
||||||
|
def recurse(i: int):
|
||||||
|
c = literal[i]
|
||||||
|
if maybe_escaped_underscores and c == '_':
|
||||||
|
yield f'[^{c}\\\\]'
|
||||||
|
yield ' | '
|
||||||
|
yield f'"\\\\"? "{c}"'
|
||||||
|
else:
|
||||||
|
yield f'[^{c}]'
|
||||||
|
if i < len(literal) - 1:
|
||||||
|
yield ' | '
|
||||||
|
yield self._format_literal(c)
|
||||||
|
yield ' ('
|
||||||
|
yield from recurse(i + 1)
|
||||||
|
yield ')?'
|
||||||
|
|
||||||
|
return ''.join(('(', *recurse(0), ')'))
|
||||||
|
|
||||||
def _add_rule(self, name, rule):
|
def _add_rule(self, name, rule):
|
||||||
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
|
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
|
||||||
if esc_name not in self._rules or self._rules[esc_name] == rule:
|
if esc_name not in self._rules or self._rules[esc_name] == rule:
|
||||||
|
@ -169,10 +249,10 @@ class SchemaConverter:
|
||||||
|
|
||||||
def get_dot():
|
def get_dot():
|
||||||
if self._dotall:
|
if self._dotall:
|
||||||
rule = '[\\U00000000-\\U0010FFFF]'
|
rule = DOTALL
|
||||||
else:
|
else:
|
||||||
# Accept any character... except \n and \r line break chars (\x0A and \xOD)
|
# Accept any character... except \n and \r line break chars (\x0A and \xOD)
|
||||||
rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
|
rule = DOT
|
||||||
return self._add_rule(f'dot', rule)
|
return self._add_rule(f'dot', rule)
|
||||||
|
|
||||||
def join_seq():
|
def join_seq():
|
||||||
|
@ -246,13 +326,6 @@ class SchemaConverter:
|
||||||
|
|
||||||
(sub, sub_is_literal) = seq[-1]
|
(sub, sub_is_literal) = seq[-1]
|
||||||
|
|
||||||
if min_times == 0 and max_times is None:
|
|
||||||
seq[-1] = (f'{sub}*', False)
|
|
||||||
elif min_times == 0 and max_times == 1:
|
|
||||||
seq[-1] = (f'{sub}?', False)
|
|
||||||
elif min_times == 1 and max_times is None:
|
|
||||||
seq[-1] = (f'{sub}+', False)
|
|
||||||
else:
|
|
||||||
if not sub_is_literal:
|
if not sub_is_literal:
|
||||||
id = sub_rule_ids.get(sub)
|
id = sub_rule_ids.get(sub)
|
||||||
if id is None:
|
if id is None:
|
||||||
|
@ -260,12 +333,7 @@ class SchemaConverter:
|
||||||
sub_rule_ids[sub] = id
|
sub_rule_ids[sub] = id
|
||||||
sub = id
|
sub = id
|
||||||
|
|
||||||
seq[-1] = (
|
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
|
||||||
' '.join(
|
|
||||||
([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
|
|
||||||
([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
|
|
||||||
False
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
literal = ''
|
literal = ''
|
||||||
while i < length:
|
while i < length:
|
||||||
|
@ -373,49 +441,47 @@ class SchemaConverter:
|
||||||
' "]" space')
|
' "]" space')
|
||||||
else:
|
else:
|
||||||
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
|
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
|
||||||
list_item_operator = f'( "," space {item_rule_name} )'
|
|
||||||
successive_items = ""
|
|
||||||
min_items = schema.get("minItems", 0)
|
min_items = schema.get("minItems", 0)
|
||||||
max_items = schema.get("maxItems")
|
max_items = schema.get("maxItems")
|
||||||
if min_items > 0:
|
return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
|
||||||
successive_items = list_item_operator * (min_items - 1)
|
|
||||||
min_items -= 1
|
|
||||||
if max_items is not None and max_items > min_items:
|
|
||||||
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
|
|
||||||
else:
|
|
||||||
successive_items += list_item_operator + "*"
|
|
||||||
if min_items == 0:
|
|
||||||
rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
|
|
||||||
else:
|
|
||||||
rule = f'"[" space {item_rule_name} {successive_items} "]" space'
|
|
||||||
return self._add_rule(rule_name, rule)
|
|
||||||
|
|
||||||
elif schema_type in (None, 'string') and 'pattern' in schema:
|
elif schema_type in (None, 'string') and 'pattern' in schema:
|
||||||
return self._visit_pattern(schema['pattern'], rule_name)
|
return self._visit_pattern(schema['pattern'], rule_name)
|
||||||
|
|
||||||
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
|
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
|
||||||
return self._add_rule(
|
return self._add_primitive(
|
||||||
'root' if rule_name == 'root' else schema_format,
|
'root' if rule_name == 'root' else schema_format,
|
||||||
PRIMITIVE_RULES['uuid']
|
PRIMITIVE_RULES['uuid']
|
||||||
)
|
)
|
||||||
|
|
||||||
elif schema_type in (None, 'string') and schema_format in DATE_RULES:
|
elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
|
||||||
for t, r in DATE_RULES.items():
|
prim_name = f'{schema_format}-string'
|
||||||
self._add_rule(t, r)
|
return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
|
||||||
return schema_format + '-string'
|
|
||||||
|
elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
|
||||||
|
char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
|
||||||
|
min_len = schema.get('minLength', 0)
|
||||||
|
max_len = schema.get('maxLength')
|
||||||
|
|
||||||
|
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
|
||||||
|
|
||||||
elif (schema_type == 'object') or (len(schema) == 0):
|
elif (schema_type == 'object') or (len(schema) == 0):
|
||||||
for n in OBJECT_RULE_NAMES:
|
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
|
||||||
self._add_rule(n, PRIMITIVE_RULES[n])
|
|
||||||
return self._add_rule(rule_name, 'object')
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
|
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
|
||||||
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
|
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
|
||||||
return self._add_rule(
|
return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
|
||||||
'root' if rule_name == 'root' else schema_type,
|
|
||||||
PRIMITIVE_RULES[schema_type]
|
def _add_primitive(self, name: str, rule: BuiltinRule):
|
||||||
)
|
n = self._add_rule(name, rule.content)
|
||||||
|
|
||||||
|
for dep in rule.deps:
|
||||||
|
dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
|
||||||
|
assert dep_rule, f'Rule {dep} not known'
|
||||||
|
if dep not in self._rules:
|
||||||
|
self._add_primitive(dep, dep_rule)
|
||||||
|
return n
|
||||||
|
|
||||||
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
|
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
|
||||||
prop_order = self._prop_order
|
prop_order = self._prop_order
|
||||||
|
@ -437,7 +503,7 @@ class SchemaConverter:
|
||||||
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
|
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
|
||||||
prop_kv_rule_names["*"] = self._add_rule(
|
prop_kv_rule_names["*"] = self._add_rule(
|
||||||
f'{sub_name}-kv',
|
f'{sub_name}-kv',
|
||||||
self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
|
self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
|
||||||
)
|
)
|
||||||
optional_props.append("*")
|
optional_props.append("*")
|
||||||
|
|
|
@ -190,7 +190,7 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* n_ubatch */ {512},
|
/* n_ubatch */ {512},
|
||||||
/* type_k */ {GGML_TYPE_F16},
|
/* type_k */ {GGML_TYPE_F16},
|
||||||
/* type_v */ {GGML_TYPE_F16},
|
/* type_v */ {GGML_TYPE_F16},
|
||||||
/* n_threads */ {get_num_physical_cores()},
|
/* n_threads */ {get_math_cpu_count()},
|
||||||
/* n_gpu_layers */ {99},
|
/* n_gpu_layers */ {99},
|
||||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||||
/* main_gpu */ {0},
|
/* main_gpu */ {0},
|
||||||
|
|
|
@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example:
|
||||||
|
|
||||||
## Model conversion
|
## Model conversion
|
||||||
|
|
||||||
- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
|
1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
|
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
|
||||||
|
|
|
@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example:
|
||||||
|
|
||||||
## LLaVA 1.5
|
## LLaVA 1.5
|
||||||
|
|
||||||
- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
|
1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
|
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
|
||||||
|
|
|
@ -146,7 +146,6 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
|
|
||||||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
|
|
||||||
|
|
||||||
std::string system_prompt, user_prompt;
|
std::string system_prompt, user_prompt;
|
||||||
size_t image_pos = prompt.find("<image>");
|
size_t image_pos = prompt.find("<image>");
|
||||||
|
@ -180,7 +179,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
|
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
|
||||||
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
||||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||||
|
|
||||||
|
|
|
@ -64,13 +64,10 @@ int main(int argc, char ** argv) {
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
// Tokenize the prompt
|
// Tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
LOG("add_bos tgt: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
std::vector<llama_token> all;
|
std::vector<llama_token> all;
|
||||||
|
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
all = inp;
|
all = inp;
|
||||||
|
|
||||||
const int max_context_size = llama_n_ctx(ctx);
|
const int max_context_size = llama_n_ctx(ctx);
|
||||||
|
|
|
@ -28,10 +28,8 @@ int main(int argc, char ** argv){
|
||||||
GGML_ASSERT(model != nullptr);
|
GGML_ASSERT(model != nullptr);
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
fprintf(stderr, "%s: tokenization done\n", __func__);
|
fprintf(stderr, "%s: tokenization done\n", __func__);
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -34,11 +34,8 @@ int main(int argc, char ** argv){
|
||||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
LOG("add_bos tgt: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
|
|
||||||
llama_ngram_cache ngram_cache_context;
|
llama_ngram_cache ngram_cache_context;
|
||||||
llama_ngram_cache ngram_cache_dynamic;
|
llama_ngram_cache ngram_cache_dynamic;
|
||||||
|
|
|
@ -42,11 +42,8 @@ int main(int argc, char ** argv){
|
||||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
LOG("add_bos tgt: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
|
|
||||||
llama_ngram_cache ngram_cache_context;
|
llama_ngram_cache ngram_cache_context;
|
||||||
llama_ngram_cache ngram_cache_dynamic;
|
llama_ngram_cache ngram_cache_dynamic;
|
||||||
|
|
|
@ -304,13 +304,15 @@ These options help improve the performance and memory usage of the LLaMA models.
|
||||||
|
|
||||||
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
|
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
|
||||||
|
|
||||||
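As a rough illustration of the prompt cache described above (the binary name, model path, and prompt are placeholders, not part of this change):

```shell
# First run creates prompt.bin; later runs reuse it to skip re-evaluating the initial prompt
./main -m models/7B/ggml-model-q4_0.gguf --prompt-cache prompt.bin -p "Once upon a time"
```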
### Grammars
|
### Grammars & JSON schemas
|
||||||
|
|
||||||
- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
|
- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
|
||||||
|
|
||||||
|
- `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead.
|
||||||
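A hedged sketch of these flags in use (binary name and model path are placeholders; the `$ref` conversion command is quoted from the option description above):

```shell
# Inline GBNF grammar constraining the answer to yes/no
./main -m model.gguf --grammar 'root ::= "yes" | "no"' -p "Is the sky blue?"

# JSON schema constraining the output to an array of strings
./main -m model.gguf --json-schema '{"items": {"type": "string"}, "minItems": 3}' -p "List three colors."

# Schemas with external $refs: pre-convert to a grammar first
./main -m model.gguf --grammar "$( python examples/json_schema_to_grammar.py myschema.json )" -p "Answer in JSON."
```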
|
|
||||||
### Quantization
|
### Quantization
|
||||||
|
|
||||||
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
|
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).
|
||||||
|
|
||||||
## Additional Options
|
## Additional Options
|
||||||
|
|
||||||
|
|
|
@ -235,7 +235,7 @@ int main(int argc, char ** argv) {
|
||||||
// The file exists and is not empty
|
// The file exists and is not empty
|
||||||
session_tokens.resize(n_ctx);
|
session_tokens.resize(n_ctx);
|
||||||
size_t n_token_count_out = 0;
|
size_t n_token_count_out = 0;
|
||||||
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||||
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -246,6 +246,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||||
LOG("add_bos: %d\n", add_bos);
|
LOG("add_bos: %d\n", add_bos);
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
|
@ -255,7 +256,7 @@ int main(int argc, char ** argv) {
|
||||||
if (params.chatml) {
|
if (params.chatml) {
|
||||||
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
||||||
}
|
}
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
} else {
|
} else {
|
||||||
LOG("use session tokens\n");
|
LOG("use session tokens\n");
|
||||||
embd_inp = session_tokens;
|
embd_inp = session_tokens;
|
||||||
|
@ -277,10 +278,10 @@ int main(int argc, char ** argv) {
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
|
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||||
|
|
||||||
original_prompt_len = original_inp.size();
|
original_prompt_len = original_inp.size();
|
||||||
|
@ -339,14 +340,14 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// prefix & suffix for instruct mode
|
// prefix & suffix for instruct mode
|
||||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
|
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
|
||||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
|
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
|
||||||
|
|
||||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||||
|
|
||||||
// chatml prefix & suffix
|
// chatml prefix & suffix
|
||||||
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
|
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
|
||||||
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
|
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
|
||||||
|
|
||||||
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
|
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
|
||||||
|
@ -693,7 +694,7 @@ int main(int argc, char ** argv) {
|
||||||
// optionally save the session on first sample (for faster prompt loading next time)
|
// optionally save the session on first sample (for faster prompt loading next time)
|
||||||
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
|
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
|
||||||
need_to_save_session = false;
|
need_to_save_session = false;
|
||||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||||
|
|
||||||
LOG("saved session to %s\n", path_session.c_str());
|
LOG("saved session to %s\n", path_session.c_str());
|
||||||
}
|
}
|
||||||
|
@ -935,7 +936,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
||||||
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
llama_print_timings(ctx);
|
||||||
|
|
|
@ -3,19 +3,18 @@
|
||||||
TODO
|
TODO
|
||||||
|
|
||||||
## Llama 2 70B Scorechart
|
## Llama 2 70B Scorechart
|
||||||
Quantization | Model size (GiB) | Perplexity | Delta to fp16
|
| Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
|
||||||
-- | -- | -- | --
|
|--------------|------------------|------------|---------------|
|
||||||
Q4_0 | 36.20 | 3.5550 | 3.61%
|
| Q4_0 | 36.20 | 3.5550 | 3.61% |
|
||||||
Q4_1 | 40.20 | 3.5125 | 2.37%
|
| Q4_1 | 40.20 | 3.5125 | 2.37% |
|
||||||
Q5_0 | 44.20 | 3.4744 | 1.26%
|
| Q5_0 | 44.20 | 3.4744 | 1.26% |
|
||||||
Q2_K | 27.27 | 3.7339 | 8.82%
|
| Q2_K | 27.27 | 3.7339 | 8.82% |
|
||||||
Q3_K_S | 27.86 | 3.7019 | 7.89%
|
| Q3_K_S | 27.86 | 3.7019 | 7.89% |
|
||||||
Q3_K_M | 30.83 | 3.5932 | 4.72%
|
| Q3_K_M | 30.83 | 3.5932 | 4.72% |
|
||||||
Q3_K_L | 33.67 | 3.5617 | 3.80%
|
| Q3_K_L | 33.67 | 3.5617 | 3.80% |
|
||||||
Q4_K_S | 36.39 | 3.4852 | 1.57%
|
| Q4_K_S | 36.39 | 3.4852 | 1.57% |
|
||||||
Q4_K_M | 38.54 | 3.4725 | 1.20%
|
| Q4_K_M | 38.54 | 3.4725 | 1.20% |
|
||||||
Q5_K_S | 44.20 | 3.4483 | 0.50%
|
| Q5_K_S | 44.20 | 3.4483 | 0.50% |
|
||||||
Q5_K_M | 45.41 | 3.4451 | 0.40%
|
| Q5_K_M | 45.41 | 3.4451 | 0.40% |
|
||||||
Q6_K | 52.70 | 3.4367 | 0.16%
|
| Q6_K | 52.70 | 3.4367 | 0.16% |
|
||||||
fp16 | 128.5 | 3.4313 | -
|
| fp16 | 128.5 | 3.4313 | - |
|
||||||
|
|
||||||
|
|
|
@ -315,10 +315,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||||
// BOS tokens will be added for each chunk before eval
|
// BOS tokens will be added for each chunk before eval
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
|
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
|
@ -454,6 +455,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
// BOS tokens will be added for each chunk before eval
|
// BOS tokens will be added for each chunk before eval
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
|
|
||||||
std::ofstream logits_stream;
|
std::ofstream logits_stream;
|
||||||
if (!params.logits_file.empty()) {
|
if (!params.logits_file.empty()) {
|
||||||
|
@ -470,7 +472,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
@ -771,9 +773,6 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
|
||||||
|
|
||||||
// The tasks should be randomized so the score stabilizes quickly.
|
// The tasks should be randomized so the score stabilizes quickly.
|
||||||
bool randomize_tasks = true;
|
bool randomize_tasks = true;
|
||||||
|
|
||||||
|
@ -818,7 +817,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
||||||
for (size_t j = 0; j < 4; j++) {
|
for (size_t j = 0; j < 4; j++) {
|
||||||
hs_cur.ending[j] = prompt_lines[idx*6+2+j];
|
hs_cur.ending[j] = prompt_lines[idx*6+2+j];
|
||||||
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
|
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// determine the common prefix of the endings
|
// determine the common prefix of the endings
|
||||||
|
@ -837,7 +836,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
|
hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
|
||||||
hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
|
hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
|
||||||
|
|
||||||
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
|
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());
|
||||||
|
|
||||||
// Delete the selected random example from the prompt
|
// Delete the selected random example from the prompt
|
||||||
if (randomize_tasks) {
|
if (randomize_tasks) {
|
||||||
|
@ -1110,12 +1109,9 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||||
|
|
||||||
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
|
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
|
||||||
|
|
||||||
for (auto & task : data) {
|
for (auto & task : data) {
|
||||||
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
|
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
|
||||||
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
|
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
|
||||||
|
|
||||||
task.common_prefix = 0;
|
task.common_prefix = 0;
|
||||||
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
|
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
|
||||||
|
@ -1130,8 +1126,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||||
task.seq_tokens[0].size() - task.common_prefix +
|
task.seq_tokens[0].size() - task.common_prefix +
|
||||||
task.seq_tokens[1].size() - task.common_prefix;
|
task.seq_tokens[1].size() - task.common_prefix;
|
||||||
|
|
||||||
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
|
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
|
||||||
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
|
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
||||||
|
@ -1322,7 +1318,7 @@ struct multiple_choice_task {
|
||||||
std::vector<float> log_probs;
|
std::vector<float> log_probs;
|
||||||
};
|
};
|
||||||
|
|
||||||
static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
|
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
|
||||||
if (task.question.empty() || task.mc1.answers.empty()) {
|
if (task.question.empty() || task.mc1.answers.empty()) {
|
||||||
if (log_error) {
|
if (log_error) {
|
||||||
printf("%s: found bad task with empty question and/or answers\n", __func__);
|
printf("%s: found bad task with empty question and/or answers\n", __func__);
|
||||||
|
@ -1337,7 +1333,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
|
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
|
||||||
}
|
}
|
||||||
auto min_len = task.seq_tokens.front().size();
|
auto min_len = task.seq_tokens.front().size();
|
||||||
for (auto& seq : task.seq_tokens) {
|
for (auto& seq : task.seq_tokens) {
|
||||||
|
@ -1436,9 +1432,6 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||||
n_task = params.multiple_choice_tasks;
|
n_task = params.multiple_choice_tasks;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
|
||||||
|
|
||||||
printf("%s: preparing task data", __func__);
|
printf("%s: preparing task data", __func__);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
if (n_task > 500) {
|
if (n_task > 500) {
|
||||||
|
@ -1446,7 +1439,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
std::atomic<int> counter(0);
|
std::atomic<int> counter(0);
|
||||||
std::atomic<int> n_bad(0);
|
std::atomic<int> n_bad(0);
|
||||||
auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
|
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
|
||||||
int num_tasks = tasks.size();
|
int num_tasks = tasks.size();
|
||||||
int n_bad_local = 0;
|
int n_bad_local = 0;
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@ -1457,7 +1450,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||||
}
|
}
|
||||||
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
|
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
|
||||||
for (int i = first; i < last; ++i) {
|
for (int i = first; i < last; ++i) {
|
||||||
if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
|
if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -1479,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||||
int i_task = 0;
|
int i_task = 0;
|
||||||
for (auto& task : tasks) {
|
for (auto& task : tasks) {
|
||||||
++i_task;
|
++i_task;
|
||||||
if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
|
if (!multiple_choice_prepare_one_task(ctx, task, true)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (i_task%n_dot == 0) {
|
if (i_task%n_dot == 0) {
|
||||||
|
@ -1715,6 +1708,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||||
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
|
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
|
||||||
const int nv = 2*((n_vocab + 1)/2) + 4;
|
const int nv = 2*((n_vocab + 1)/2) + 4;
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
|
|
||||||
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
|
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
|
||||||
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
|
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
|
||||||
|
@ -1858,12 +1852,20 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const int32_t n_ctx = params.n_ctx;
|
const int32_t n_ctx = params.n_ctx;
|
||||||
|
|
||||||
|
if (n_ctx <= 0) {
|
||||||
|
fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
|
const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
|
||||||
|
|
||||||
if (ppl) {
|
if (ppl) {
|
||||||
int n_seq = std::max(1, params.n_batch / n_ctx);
|
const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
|
||||||
int32_t n_kv = n_seq * n_ctx;
|
const int32_t n_kv = n_seq * n_ctx;
|
||||||
|
|
||||||
params.n_parallel = n_seq;
|
params.n_parallel = n_seq;
|
||||||
params.n_ctx = n_kv;
|
params.n_ctx = n_kv;
|
||||||
|
|
||||||
params.n_batch = std::min(params.n_batch, n_kv);
|
params.n_batch = std::min(params.n_batch, n_kv);
|
||||||
} else {
|
} else {
|
||||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||||
|
|
|
@ -4,17 +4,17 @@ TODO
|
||||||
|
|
||||||
## Llama 2 7B
|
## Llama 2 7B
|
||||||
|
|
||||||
Quantization | Bits per Weight (BPW)
|
| Quantization | Bits per Weight (BPW) |
|
||||||
-- | --
|
|--------------|-----------------------|
|
||||||
Q2_K | 3.35
|
| Q2_K | 3.35 |
|
||||||
Q3_K_S | 3.50
|
| Q3_K_S | 3.50 |
|
||||||
Q3_K_M | 3.91
|
| Q3_K_M | 3.91 |
|
||||||
Q3_K_L | 4.27
|
| Q3_K_L | 4.27 |
|
||||||
Q4_K_S | 4.58
|
| Q4_K_S | 4.58 |
|
||||||
Q4_K_M | 4.84
|
| Q4_K_M | 4.84 |
|
||||||
Q5_K_S | 5.52
|
| Q5_K_S | 5.52 |
|
||||||
Q5_K_M | 5.68
|
| Q5_K_M | 5.68 |
|
||||||
Q6_K | 6.56
|
| Q6_K | 6.56 |
|
||||||
|
|
||||||
## Llama 2 13B
|
## Llama 2 13B
|
||||||
Quantization | Bits per Weight (BPW)
|
Quantization | Bits per Weight (BPW)
|
||||||
|
|
|
@ -8,7 +8,7 @@ print(subprocess.check_output(
|
||||||
"python",
|
"python",
|
||||||
os.path.join(
|
os.path.join(
|
||||||
os.path.dirname(os.path.realpath(__file__)),
|
os.path.dirname(os.path.realpath(__file__)),
|
||||||
"json-schema-to-grammar.py"),
|
"json_schema_to_grammar.py"),
|
||||||
*rest,
|
*rest,
|
||||||
"-",
|
"-",
|
||||||
"--raw-pattern",
|
"--raw-pattern",
|
||||||
|
|
|
@ -24,6 +24,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::string result0;
|
std::string result0;
|
||||||
std::string result1;
|
std::string result1;
|
||||||
|
std::string result2;
|
||||||
|
|
||||||
// init
|
// init
|
||||||
llama_model * model;
|
llama_model * model;
|
||||||
|
@ -44,8 +45,8 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// save state (rng, logits, embedding and kv_cache) to file
|
// save state (rng, logits, embedding and kv_cache) to file
|
||||||
{
|
{
|
||||||
std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
|
std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
|
||||||
const size_t written = llama_copy_state_data(ctx, state_mem.data());
|
const size_t written = llama_state_get_data(ctx, state_mem.data());
|
||||||
|
|
||||||
FILE *fp_write = fopen("dump_state.bin", "wb");
|
FILE *fp_write = fopen("dump_state.bin", "wb");
|
||||||
fwrite(state_mem.data(), 1, written, fp_write);
|
fwrite(state_mem.data(), 1, written, fp_write);
|
||||||
|
@ -97,13 +98,13 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// load state (rng, logits, embedding and kv_cache) from file
|
// load state (rng, logits, embedding and kv_cache) from file
|
||||||
{
|
{
|
||||||
std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
|
std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
|
||||||
|
|
||||||
FILE * fp_read = fopen("dump_state.bin", "rb");
|
FILE * fp_read = fopen("dump_state.bin", "rb");
|
||||||
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
|
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
|
||||||
fclose(fp_read);
|
fclose(fp_read);
|
||||||
|
|
||||||
if (read != llama_set_state_data(ctx2, state_mem.data())) {
|
if (read != llama_state_set_data(ctx2, state_mem.data())) {
|
||||||
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
||||||
llama_free(ctx2);
|
llama_free(ctx2);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
@ -141,16 +142,104 @@ int main(int argc, char ** argv) {
|
||||||
n_past += 1;
|
n_past += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("\n");
|
printf("\n\n");
|
||||||
|
|
||||||
llama_free(ctx2);
|
llama_free(ctx2);
|
||||||
llama_free_model(model);
|
|
||||||
|
|
||||||
if (result0 != result1) {
|
if (result0 != result1) {
|
||||||
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
|
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// make new context
|
||||||
|
auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||||
|
|
||||||
|
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||||
|
|
||||||
|
// load state (rng, logits, embedding and kv_cache) from file
|
||||||
|
{
|
||||||
|
std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
|
||||||
|
|
||||||
|
FILE * fp_read = fopen("dump_state.bin", "rb");
|
||||||
|
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
|
||||||
|
fclose(fp_read);
|
||||||
|
|
||||||
|
if (read != llama_state_set_data(ctx3, state_mem.data())) {
|
||||||
|
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
||||||
|
llama_free(ctx3);
|
||||||
|
llama_free_model(model);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
// restore state (last tokens)
|
||||||
|
n_past = n_past_saved;
|
||||||
|
|
||||||
|
// save seq 0 and load into seq 1
|
||||||
|
{
|
||||||
|
// save kv of seq 0
|
||||||
|
std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
|
||||||
|
const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
|
||||||
|
if (ncopy != seq_store.size()) {
|
||||||
|
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||||
|
llama_free(ctx3);
|
||||||
|
llama_free_model(model);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||||
|
|
||||||
|
// erase whole kv
|
||||||
|
llama_kv_cache_clear(ctx3);
|
||||||
|
fprintf(stderr, "%s : kv cache cleared\n", __func__);
|
||||||
|
|
||||||
|
// restore kv into seq 1
|
||||||
|
const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
|
||||||
|
if (nset != seq_store.size()) {
|
||||||
|
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||||
|
llama_free(ctx3);
|
||||||
|
llama_free_model(model);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
|
||||||
|
}
|
||||||
|
|
||||||
|
// third run with seq 1 instead of 0
|
||||||
|
for (auto i = 0; i < params.n_predict; i++) {
|
||||||
|
auto * logits = llama_get_logits(ctx3);
|
||||||
|
auto n_vocab = llama_n_vocab(model);
|
||||||
|
std::vector<llama_token_data> candidates;
|
||||||
|
candidates.reserve(n_vocab);
|
||||||
|
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||||
|
}
|
||||||
|
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||||
|
auto next_token = llama_sample_token(ctx3, &candidates_p);
|
||||||
|
auto next_token_str = llama_token_to_piece(ctx3, next_token);
|
||||||
|
|
||||||
|
printf("%s", next_token_str.c_str());
|
||||||
|
result2 += next_token_str;
|
||||||
|
|
||||||
|
if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
|
||||||
|
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||||
|
llama_free(ctx3);
|
||||||
|
llama_free_model(model);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
n_past += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
llama_free(ctx3);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
if (result0 != result2) {
|
||||||
|
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
fprintf(stderr, "\n%s : success\n", __func__);
|
fprintf(stderr, "\n%s : success\n", __func__);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -11,7 +11,7 @@ install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_definitions(${TARGET} PRIVATE
|
target_compile_definitions(${TARGET} PRIVATE
|
||||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||||
)
|
)
|
||||||
target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||||
if (LLAMA_SERVER_SSL)
|
if (LLAMA_SERVER_SSL)
|
||||||
find_package(OpenSSL REQUIRED)
|
find_package(OpenSSL REQUIRED)
|
||||||
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
|
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
|
||||||
|
|
|
@ -11,6 +11,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||||
* Continuous batching
|
* Continuous batching
|
||||||
* Multimodal (wip)
|
* Multimodal (wip)
|
||||||
* Monitoring endpoints
|
* Monitoring endpoints
|
||||||
|
* Schema-constrained JSON response format
|
||||||
|
|
||||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
|
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
|
||||||
|
|
||||||
|
@ -57,6 +58,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
|
||||||
- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
|
- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
|
||||||
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
|
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
|
||||||
- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
|
- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
|
||||||
|
- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled.
|
||||||
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||||
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
|
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
|
||||||
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
|
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
|
||||||
|
@ -249,6 +251,8 @@ node index.js
|
||||||
|
|
||||||
`grammar`: Set grammar for grammar-based sampling. Default: no grammar
|
`grammar`: Set grammar for grammar-based sampling. Default: no grammar
|
||||||
|
|
||||||
|
`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` for a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features. Default: no JSON schema.
|
||||||
|
|
||||||
`seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
|
`seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
|
||||||
|
|
||||||
`ignore_eos`: Ignore end of stream token and continue generating. Default: `false`
|
`ignore_eos`: Ignore end of stream token and continue generating. Default: `false`
|
||||||
|
@ -364,6 +368,8 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||||
|
|
||||||
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
|
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
|
||||||
|
|
||||||
|
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers.
|
||||||
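For instance, a schema-constrained request could look like the following sketch (host, port, and the schema itself are placeholders, and the endpoint path assumes the OpenAI-compatible route this section describes; only the `response_format` field is the point here):

```shell
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "messages": [{"role": "user", "content": "Write a 10 to 100 character greeting."}],
          "response_format": {"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}
        }'
```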
|
|
||||||
*Examples:*
|
*Examples:*
|
||||||
|
|
||||||
You can use either Python `openai` library with appropriate checkpoints:
|
You can use either Python `openai` library with appropriate checkpoints:
|
||||||
|
@ -517,6 +523,57 @@ Available metrics:
|
||||||
- `llamacpp:requests_processing`: Number of requests processing.
|
- `llamacpp:requests_processing`: Number of requests processing.
|
||||||
- `llamacpp:requests_deferred`: Number of requests deferred.
|
- `llamacpp:requests_deferred`: Number of requests deferred.
|
||||||
|
|
||||||
|
- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
|
||||||
|
|
||||||
|
*Options:*
|
||||||
|
|
||||||
|
`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
|
||||||
|
|
||||||
|
### Result JSON
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id_slot": 0,
|
||||||
|
"filename": "slot_save_file.bin",
|
||||||
|
"n_saved": 1745,
|
||||||
|
"n_written": 14309796,
|
||||||
|
"timings": {
|
||||||
|
"save_ms": 49.865
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
|
||||||
|
|
||||||
|
*Options:*
|
||||||
|
|
||||||
|
`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
|
||||||
|
|
||||||
|
### Result JSON
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id_slot": 0,
|
||||||
|
"filename": "slot_save_file.bin",
|
||||||
|
"n_restored": 1745,
|
||||||
|
"n_read": 14309796,
|
||||||
|
"timings": {
|
||||||
|
"restore_ms": 42.937
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
|
||||||
|
|
||||||
|
### Result JSON
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id_slot": 0,
|
||||||
|
"n_erased": 1745
|
||||||
|
}
|
||||||
|
```
|
||||||
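A rough end-to-end sketch of these slot endpoints, assuming the server was started with `--slot-save-path` and that `filename` is passed in the JSON body (host, port, and file name are placeholders):

```shell
# Save the prompt cache of slot 0
curl -X POST "http://localhost:8080/slots/0?action=save" \
    -H "Content-Type: application/json" \
    -d '{"filename": "slot_save_file.bin"}'

# Restore it later
curl -X POST "http://localhost:8080/slots/0?action=restore" \
    -H "Content-Type: application/json" \
    -d '{"filename": "slot_save_file.bin"}'

# Erase the prompt cache of slot 0
curl -X POST "http://localhost:8080/slots/0?action=erase"
```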
|
|
||||||
## More examples
|
## More examples
|
||||||
|
|
||||||
### Change system prompt on runtime
|
### Change system prompt on runtime
|
||||||
|
|
|
@ -2,13 +2,15 @@
|
||||||
|
|
||||||
Benchmark is using [k6](https://k6.io/).
|
Benchmark is using [k6](https://k6.io/).
|
||||||
|
|
||||||
##### Install k6
|
##### Install k6 and sse extension
|
||||||
|
|
||||||
Follow instructions from: https://k6.io/docs/get-started/installation/
|
SSE is not supported by default in k6; you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
|
||||||
|
|
||||||
Example for ubuntu:
|
Example:
|
||||||
```shell
|
```shell
|
||||||
snap install k6
|
go install go.k6.io/xk6/cmd/xk6@latest
|
||||||
|
xk6 build master \
|
||||||
|
--with github.com/phymbert/xk6-sse
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Download a dataset
|
#### Download a dataset
|
||||||
|
@ -46,7 +48,7 @@ server --host localhost --port 8080 \
|
||||||
|
|
||||||
For 500 chat completion requests with 8 concurrent users over a maximum of 10 minutes, run:
|
For 500 chat completion requests with 8 concurrent users over a maximum of 10 minutes, run:
|
||||||
```shell
|
```shell
|
||||||
k6 run script.js --duration 10m --iterations 500 --vus 8
|
./k6 run script.js --duration 10m --iterations 500 --vus 8
|
||||||
```
|
```
|
||||||
|
|
||||||
The benchmark values can be overridden with:
|
The benchmark values can be overridden with:
|
||||||
|
@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with:
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:8080/metrics
|
curl http://localhost:8080/metrics
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Using the CI python script
|
||||||
|
The `bench.py` script does several steps:
|
||||||
|
- start the server
|
||||||
|
- define appropriate variables for k6
|
||||||
|
- run k6 script
|
||||||
|
- extract metrics from Prometheus
|
||||||
|
|
||||||
|
It is intended to be used in the CI, but you can also run it manually:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
|
||||||
|
--runner-label local \
|
||||||
|
--name local \
|
||||||
|
--branch `git rev-parse --abbrev-ref HEAD` \
|
||||||
|
--commit `git rev-parse HEAD` \
|
||||||
|
--scenario script.js \
|
||||||
|
--duration 5m \
|
||||||
|
--hf-repo ggml-org/models \
|
||||||
|
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||||
|
--model-path-prefix models \
|
||||||
|
--parallel 4 \
|
||||||
|
-ngl 33 \
|
||||||
|
--batch-size 2048 \
|
||||||
|
--ubatch-size 256 \
|
||||||
|
--ctx-size 4096 \
|
||||||
|
--n-prompts 200 \
|
||||||
|
--max-prompt-tokens 256 \
|
||||||
|
--max-tokens 256
|
||||||
|
```
|
||||||
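`bench.py` looks for the k6 binary at `./k6` by default; if the xk6-built binary lives elsewhere, point the script at it via the `BENCH_K6_BIN_PATH` environment variable it checks (the path below is an example):

```shell
# otherwise bench.py falls back to ./k6 in the current directory
export BENCH_K6_BIN_PATH=/path/to/k6
```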
|
|
|
@ -76,7 +76,6 @@ def main(args_in: list[str] | None = None) -> None:
|
||||||
data['metrics'][metric_name][metric_metric]=value
|
data['metrics'][metric_name][metric_metric]=value
|
||||||
github_env.write(
|
github_env.write(
|
||||||
f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
|
f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
|
||||||
token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
|
|
||||||
iterations = data['root_group']['checks']['success completion']['passes']
|
iterations = data['root_group']['checks']['success completion']['passes']
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
|
@ -181,16 +180,16 @@ xychart-beta
|
||||||
bench_results = {
|
bench_results = {
|
||||||
"i": iterations,
|
"i": iterations,
|
||||||
"req": {
|
"req": {
|
||||||
"p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
|
"p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
|
||||||
"avg": round(data['metrics']["http_req_duration"]["avg"], 2),
|
"avg": round(data['metrics']["http_req_duration"]["avg"], 2),
|
||||||
},
|
},
|
||||||
"pp": {
|
"pp": {
|
||||||
"p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
|
"p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
|
||||||
"avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
|
"avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
|
||||||
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
|
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
|
||||||
},
|
},
|
||||||
"tg": {
|
"tg": {
|
||||||
"p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
|
"p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
|
||||||
"avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
|
"avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
|
||||||
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
|
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
|
||||||
},
|
},
|
||||||
|
@ -206,7 +205,7 @@ xychart-beta
|
||||||
|
|
||||||
|
|
||||||
def start_benchmark(args):
|
def start_benchmark(args):
|
||||||
k6_path = 'k6'
|
k6_path = './k6'
|
||||||
if 'BENCH_K6_BIN_PATH' in os.environ:
|
if 'BENCH_K6_BIN_PATH' in os.environ:
|
||||||
k6_path = os.environ['BENCH_K6_BIN_PATH']
|
k6_path = os.environ['BENCH_K6_BIN_PATH']
|
||||||
k6_args = [
|
k6_args = [
|
||||||
|
|
|
@@ -1,4 +1,4 @@
-import http from 'k6/http'
+import sse from 'k6/x/sse'
 import {check, sleep} from 'k6'
 import {SharedArray} from 'k6/data'
 import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () {
 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
 
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -86,36 +88,62 @@ export default function () {
             }
         ],
         "model": model,
-        "stream": false,
+        "stream": true,
         "seed": 42,
         "max_tokens": max_tokens
     }
 
-    const body = JSON.stringify(payload)
+    const params = {method: 'POST', body: JSON.stringify(payload)};
 
-    let res = http.post(`${server_url}/chat/completions`, body, {
-        headers: {'Content-Type': 'application/json'},
-        timeout: '300s'
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+            }
+
+            let chunk = JSON.parse(event.data)
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
+
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
+
+        client.on('error', function (e) {
+            console.log('An unexpected error occurred: ', e.error());
+            throw e;
+        })
     })
 
     check(res, {'success completion': (r) => r.status === 200})
 
-    if (res.status === 200) {
-        const completions = res.json()
-
-        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
-
-        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-
-        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
-
-        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
-    } else {
-        console.error(`response: ${res.body} request=${payload}`)
-    }
+    const endTime = new Date()
+
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
+    }
+
+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
+    }
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
 
     sleep(0.3)
 }
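With the move to xk6-sse, the benchmark derives prompt-processing and token-generation rates from the timestamps of the streamed events rather than from one blocking response. A rough standalone sketch of the same measurement, assuming Node 18+ in ESM mode, a local llama.cpp server at a placeholder address, and an OpenAI-compatible stream that ends with a `data: [DONE]` line (host, model name and token budget below are made up):

// Hedged sketch only: measures prompt-processing (pp) and generation (tg) rates
// the same way the k6 script does; not part of the benchmark itself.
const url = 'http://localhost:8080/v1/chat/completions';    // assumed address
const payload = {
    messages: [{role: 'user', content: 'Say hello.'}],
    model: 'model',                                          // placeholder model name
    stream: true,
    max_tokens: 64,
};

const start = Date.now();
let firstEvent = null;
let usage = null;

const res = await fetch(url, {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify(payload),
});

let buf = '';
for await (const part of res.body) {
    buf += Buffer.from(part).toString('utf8');
    let nl;
    while ((nl = buf.indexOf('\n')) >= 0) {
        const line = buf.slice(0, nl).trim();
        buf = buf.slice(nl + 1);
        if (!line.startsWith('data: ') || line === 'data: [DONE]') continue;
        if (firstEvent === null) firstEvent = Date.now();
        const chunk = JSON.parse(line.slice('data: '.length));
        if (chunk.usage) usage = chunk.usage;                // final chunk carries usage
    }
}
const end = Date.now();

if (usage && firstEvent !== null && firstEvent > start && end > firstEvent) {
    const pp = usage.prompt_tokens     / (firstEvent - start) * 1e3; // prompt tokens/s
    const tg = usage.completion_tokens / (end - firstEvent)   * 1e3; // generated tokens/s
    console.log({pp, tg});
}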
(Two file diffs suppressed because they are too large.)
@@ -406,7 +406,7 @@
             throw new Error("already running");
         }
         controller.value = new AbortController();
-        for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: document.baseURI.replace(/\/+$/, '') })) {
+        for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
             const data = chunk.data;
 
             if (data.stop) {
@@ -1015,6 +1015,10 @@
     }
 
     function App(props) {
+        useEffect(() => {
+            const query = new URLSearchParams(location.search).get("q");
+            if (query) chat(query);
+        }, []);
 
         return html`
             <div class="mode-${session.value.type}">
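The added `useEffect` makes the web UI read a `q` query parameter on load and submit it as the first user prompt, so a chat can be started from a plain link. A tiny illustration of building such a link (host and prompt are placeholders):

// Illustration only: construct a deep link that the new ?q= handler will pick up.
const url = new URL('http://localhost:8080/');               // assumed server address
url.searchParams.set('q', 'What is the capital of France?');
console.log(url.toString());
// -> http://localhost:8080/?q=What+is+the+capital+of+France%3F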
@@ -1,33 +1,95 @@
-// WARNING: This file was ported from json-schema-to-grammar.py, please fix bugs / add features there first.
+// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
 const SPACE_RULE = '" "?';
 
-const PRIMITIVE_RULES = {
-  boolean: '("true" | "false") space',
-  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
-  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
-  value: 'object | array | string | number | boolean',
-  object: '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
-  array: '"[" space ( value ("," space value)* )? "]" space',
-  uuid: '"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space',
-  string: ` "\\"" (
-    [^"\\\\] |
-    "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
-  )* "\\"" space`,
-  null: '"null" space',
+function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
+  const separatorRule = opts.separatorRule ?? '';
+  const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
+
+  if (separatorRule === '') {
+    if (minItems === 0 && maxItems === 1) {
+      return `${itemRule}?`;
+    } else if (minItems === 1 && maxItems === undefined) {
+      return `${itemRule}+`;
+    }
+  }
+
+  let result = '';
+  if (minItems > 0) {
+    if (itemRuleIsLiteral && separatorRule === '') {
+      result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
+    } else {
+      result = Array.from({ length: minItems }, () => itemRule)
+        .join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
+    }
+  }
+
+  const optRepetitions = (upToN, prefixWithSep=false) => {
+    const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
+    if (upToN === 0) {
+      return '';
+    } else if (upToN === 1) {
+      return `(${content})?`;
+    } else if (separatorRule !== '' && !prefixWithSep) {
+      return `(${content} ${optRepetitions(upToN - 1, true)})?`;
+    } else {
+      return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
+    }
+  };
+
+  if (minItems > 0 && maxItems !== minItems) {
+    result += ' ';
+  }
+
+  if (maxItems !== undefined) {
+    result += optRepetitions(maxItems - minItems, minItems > 0);
+  } else {
+    const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
+
+    if (minItems === 0 && separatorRule !== '') {
+      result = `(${itemRule} ${itemOperator}*)?`;
+    } else {
+      result += `${itemOperator}*`;
+    }
+  }
+
+  return result;
+}
+
+class BuiltinRule {
+  constructor(content, deps) {
+    this.content = content;
+    this.deps = deps || [];
+  }
+}
+
+const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
+
+const PRIMITIVE_RULES = {
+  boolean        : new BuiltinRule('("true" | "false") space', []),
+  'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
+  'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
+  number         : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
+  integer        : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
+  value          : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
+  object         : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
+  array          : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
+  uuid           : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
+  char           : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
+  string         : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
+  null           : new BuiltinRule('"null" space', []),
 };
-const OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'];
 
 // TODO: support "uri", "email" string formats
-const DATE_RULES = {
-  'date'   : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
-  'time'   : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
-  'date-time': 'date "T" time',
-  'date-string': '"\\"" date "\\"" space',
-  'time-string': '"\\"" time "\\"" space',
-  'date-time-string': '"\\"" date-time "\\"" space',
-};
+const STRING_FORMAT_RULES = {
+  'date'            : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+  'time'            : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+  'date-time'       : new BuiltinRule('date "T" time', ['date', 'time']),
+  'date-string'     : new BuiltinRule('"\\"" date "\\"" space', ['date']),
+  'time-string'     : new BuiltinRule('"\\"" time "\\"" space', ['time']),
+  'date-time-string': new BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
+}
 
-const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...DATE_RULES};
+const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...STRING_FORMAT_RULES};
 
 const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
 const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
@@ -158,7 +220,7 @@ export class SchemaConverter {
         rule = '[\\U00000000-\\U0010FFFF]';
       } else {
         // Accept any character... except \n and \r line break chars (\x0A and \xOD)
-        rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]';
+        rule = '[^\\x0A\\x0D]';
       }
       return this._addRule('dot', rule);
     };
@@ -259,13 +321,6 @@ export class SchemaConverter {
 
           let [sub, subIsLiteral] = seq[seq.length - 1];
 
-          if (minTimes === 0 && maxTimes === Infinity) {
-            seq[seq.length - 1] = [`${sub}*`, false];
-          } else if (minTimes === 0 && maxTimes === 1) {
-            seq[seq.length - 1] = [`${sub}?`, false];
-          } else if (minTimes === 1 && maxTimes === Infinity) {
-            seq[seq.length - 1] = [`${sub}+`, false];
-          } else {
           if (!subIsLiteral) {
             let id = subRuleIds[sub];
             if (id === undefined) {
@@ -275,10 +330,10 @@ export class SchemaConverter {
             sub = id;
           }
 
-          const repeatedSub = Array.from({ length: minTimes }, () => subIsLiteral ? `"${sub.slice(1, -1).repeat(minTimes)}"` : sub);
-          const optionalSub = maxTimes !== undefined ? Array.from({ length: maxTimes - minTimes }, () => `${sub}?`) : [`${sub}*`];
-          seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
-          }
+          seq[seq.length - 1] = [
+            _buildRepetition(subIsLiteral ? `"${sub}"` : sub, minTimes, maxTimes, {itemRuleIsLiteral: subIsLiteral}),
+            false
+          ];
         } else {
           let literal = '';
           while (i < length) {
@@ -394,49 +449,50 @@ export class SchemaConverter {
         );
       } else {
         const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
-        const listItemOperator = `( "," space ${itemRuleName} )`;
-        let successiveItems = '';
-        let minItems = schema.minItems || 0;
+        const minItems = schema.minItems || 0;
         const maxItems = schema.maxItems;
-        if (minItems > 0) {
-          successiveItems = listItemOperator.repeat(minItems - 1);
-          minItems--;
-        }
-        if (maxItems !== undefined && maxItems > minItems) {
-          successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
-        } else {
-          successiveItems += `${listItemOperator}*`;
-        }
-        const rule = minItems === 0
-          ? `"[" space ( ${itemRuleName} ${successiveItems} )? "]" space`
-          : `"[" space ${itemRuleName} ${successiveItems} "]" space`;
-        return this._addRule(ruleName, rule);
+        return this._addRule(ruleName, '"[" space ' + _buildRepetition(itemRuleName, minItems, maxItems, {separatorRule: '"," space'}) + ' "]" space');
       }
     } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
       return this._visitPattern(schema.pattern, ruleName);
     } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
-      return this._addRule(
+      return this._addPrimitive(
         ruleName === 'root' ? 'root' : schemaFormat,
-        PRIMITIVE_RULES['uuid'])
-    } else if ((schemaType === undefined || schemaType === 'string') && schema.format in DATE_RULES) {
-      for (const [t, r] of Object.entries(DATE_RULES)) {
-        this._addRule(t, r);
-      }
-      return schemaFormat + '-string';
+        PRIMITIVE_RULES['uuid']
+      );
+    } else if ((schemaType === undefined || schemaType === 'string') && `${schema.format}-string` in STRING_FORMAT_RULES) {
+      const primName = `${schema.format}-string`
+      return this._addRule(ruleName, this._addPrimitive(primName, STRING_FORMAT_RULES[primName]));
+    } else if (schemaType === 'string' && ('minLength' in schema || 'maxLength' in schema)) {
+      const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
+      const minLen = schema.minLength || 0;
+      const maxLen = schema.maxLength;
+      return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
     } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
-      for (const n of OBJECT_RULE_NAMES) {
-        this._addRule(n, PRIMITIVE_RULES[n]);
-      }
-      return this._addRule(ruleName, 'object');
+      return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
     } else {
       if (!(schemaType in PRIMITIVE_RULES)) {
        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
      }
      // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-      return this._addRule(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
+      return this._addPrimitive(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
     }
   }
 
+  _addPrimitive(name, rule) {
+    let n = this._addRule(name, rule.content);
+    for (const dep of rule.deps) {
+      const depRule = PRIMITIVE_RULES[dep] || STRING_FORMAT_RULES[dep];
+      if (!depRule) {
+        throw new Error(`Rule ${dep} not known`);
+      }
+      if (!(dep in this._rules)) {
+        this._addPrimitive(dep, depRule);
+      }
+    }
+    return n;
+  }
+
   _buildObjectRule(properties, required, name, additionalProperties) {
     const propOrder = this._propOrder;
     // sort by position in prop_order (if specified) then by original order
@@ -462,7 +518,7 @@ export class SchemaConverter {
       const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
       propKvRuleNames['*'] = this._addRule(
         `${subName}-kv`,
-        `${this._addRule('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
+        `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
       optionalProps.push('*');
     }
 
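`_buildRepetition` replaces the old per-call string surgery for `minItems`/`maxItems` (and now also `minLength`/`maxLength` and pattern repetitions) with one helper that expands a bounded repetition into nested optional groups. Tracing the function by hand, it produces strings of roughly this shape (the rule names below are placeholders, not output of a real conversion):

// Illustration only: outputs traced by hand from _buildRepetition above.
_buildRepetition('item', 2, 4, {separatorRule: '"," space'})
// -> 'item "," space item ("," space item ("," space item)?)?'

_buildRepetition('[0-9]', 0, 3)
// -> '([0-9] ([0-9] ([0-9])?)?)?'

_buildRepetition('item', 1, undefined)
// -> 'item+'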
@@ -61,7 +61,10 @@ enum server_task_type {
     SERVER_TASK_TYPE_COMPLETION,
     SERVER_TASK_TYPE_CANCEL,
     SERVER_TASK_TYPE_NEXT_RESPONSE,
-    SERVER_TASK_TYPE_METRICS
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
 };
 
 struct server_task {
@@ -128,6 +131,7 @@ struct server_params {
 
     bool slots_endpoint = true;
     bool metrics_endpoint = false;
+    std::string slot_save_path;
 };
 
 struct server_slot {
@@ -685,6 +689,7 @@ struct server_context {
         n_ctx = llama_n_ctx(ctx);
 
         add_bos_token = llama_should_add_bos_token(model);
+        GGML_ASSERT(llama_add_eos_token(model) != 1);
 
         return true;
     }
@@ -754,7 +759,7 @@ struct server_context {
         metrics.init();
     }
 
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
         // TODO: currently, we tokenize using special tokens by default
         // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
         // but it's better compared to completely ignoring ChatML and other chat templates
@@ -772,7 +777,7 @@ struct server_context {
 
                 std::vector<llama_token> p;
                 if (first) {
-                    p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                    p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
                     first = false;
                 } else {
                     p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
@@ -789,7 +794,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;
@@ -854,7 +859,7 @@ struct server_context {
         slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
 
         // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && data.contains("grammar")) {
+        if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
             send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
             return false;
         } else if (data.contains("json_schema") && !data.contains("grammar")) {
@@ -1054,7 +1059,7 @@ struct server_context {
         system_tokens.clear();
 
         if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, true);
 
             llama_batch_clear(batch);
 
@@ -1078,7 +1083,7 @@ struct server_context {
             };
 
             if (llama_decode(ctx, batch_view) != 0) {
-                LOG_TEE("%s: llama_decode() failed\n", __func__);
+                LOG_ERROR("llama_decode() failed", {});
                 return;
             }
         }
@@ -1276,7 +1281,11 @@ struct server_context {
     }
 
     void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
-        LOG_TEE("task %i - error: %s\n", id_task, error.c_str());
+        LOG_ERROR("task error", {
+            {"id_multi", id_multi},
+            {"id_task", id_task},
+            {"error", error},
+        });
 
         server_task_result res;
         res.id = id_task;
@@ -1612,6 +1621,107 @@ struct server_context {
                     }
                     queue_results.send(res);
                 } break;
+            case SERVER_TASK_TYPE_SLOT_SAVE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    const size_t token_count = slot->cache_tokens.size();
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];
+
+                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_save_ms = (t_end - t_start) / 1000.0;
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot",   id_slot },
+                        { "filename",  filename },
+                        { "n_saved",   token_count }, // tokens saved
+                        { "n_written", nwrite },      // bytes written
+                        { "timings", {
+                            { "save_ms", t_save_ms }
+                        } }
+                    };
+                    queue_results.send(result);
+                } break;
+            case SERVER_TASK_TYPE_SLOT_RESTORE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];
+
+                    slot->cache_tokens.resize(slot->n_ctx);
+                    size_t token_count = 0;
+                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+                    if (nread == 0) {
+                        slot->cache_tokens.resize(0);
+                        send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+                    slot->cache_tokens.resize(token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_restore_ms = (t_end - t_start) / 1000.0;
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot",    id_slot },
+                        { "filename",   filename },
+                        { "n_restored", token_count }, // tokens restored
+                        { "n_read",     nread },       // bytes read
+                        { "timings", {
+                            { "restore_ms", t_restore_ms }
+                        } }
+                    };
+                    queue_results.send(result);
+                } break;
+            case SERVER_TASK_TYPE_SLOT_ERASE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    // Erase token cache
+                    const size_t n_erased = slot->cache_tokens.size();
+                    llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
+                    slot->cache_tokens.clear();
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot",  id_slot },
+                        { "n_erased", n_erased }
+                    };
+                    queue_results.send(result);
+                } break;
         }
     }
 
@@ -1809,7 +1919,7 @@ struct server_context {
             prefix_tokens.push_back(llama_token_middle(model));
             prompt_tokens = prefix_tokens;
         } else {
-            prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
+            prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
         }
 
         slot.n_past = 0;
@@ -2080,7 +2190,11 @@ struct server_context {
             if (ret != 0) {
                 if (n_batch == 1 || ret < 0) {
                     // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
+                        {"i",       i},
+                        {"n_batch", ret},
+                        {"ret",     ret},
+                    });
                     for (auto & slot : slots) {
                         slot.state = SLOT_STATE_PROCESSING;
                         slot.command = SLOT_COMMAND_NONE;
@@ -2090,12 +2204,16 @@ struct server_context {
                     break; // break loop of n_batch
                 }
 
-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
-
                 // retry with half the batch size to try to find a free slot in the KV cache
                 n_batch /= 2;
                 i -= n_batch;
 
+                LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
+                    {"i",       i},
+                    {"n_batch", n_batch},
+                    {"ret",     ret},
+                });
+
                 continue; // continue loop of n_batch
             }
 
@@ -2249,6 +2367,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  --log-disable             disables logging to a file.\n");
     printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
     printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
+    printf("  --slot-save-path PATH     path to save slot kv cache (default: disabled)\n");
     printf("\n");
     printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
@@ -2657,6 +2776,16 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             sparams.slots_endpoint = false;
         } else if (arg == "--metrics") {
             sparams.metrics_endpoint = true;
+        } else if (arg == "--slot-save-path") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.slot_save_path = argv[i];
+            // if doesn't end with DIRECTORY_SEPARATOR, add it
+            if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+                sparams.slot_save_path += DIRECTORY_SEPARATOR;
+            }
         } else if (arg == "--chat-template") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -3159,6 +3288,112 @@ int main(int argc, char ** argv) {
         res.status = 200; // HTTP OK
     };
 
+    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        json request_data = json::parse(req.body);
+        std::string filename = request_data["filename"];
+        if (!validate_file_name(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        std::string filepath = sparams.slot_save_path + filename;
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_SAVE;
+        task.data = {
+            { "id_slot",  id_slot },
+            { "filename", filename },
+            { "filepath", filepath }
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        json request_data = json::parse(req.body);
+        std::string filename = request_data["filename"];
+        if (!validate_file_name(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        std::string filepath = sparams.slot_save_path + filename;
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
+        task.data = {
+            { "id_slot",  id_slot },
+            { "filename", filename },
+            { "filepath", filepath }
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_erase = [&ctx_server, &res_error](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_ERASE;
+        task.data = {
+            { "id_slot", id_slot },
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+        std::string id_slot_str = req.path_params.at("id_slot");
+        int id_slot;
+
+        try {
+            id_slot = std::stoi(id_slot_str);
+        } catch (const std::exception &) {
+            res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        std::string action = req.get_param_value("action");
+
+        if (action == "save") {
+            handle_slots_save(req, res, id_slot);
+        } else if (action == "restore") {
+            handle_slots_restore(req, res, id_slot);
+        } else if (action == "erase") {
+            handle_slots_erase(req, res, id_slot);
+        } else {
+            res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
+        }
+    };
+
     const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
         json data = {
@@ -3521,6 +3756,10 @@ int main(int argc, char ** argv) {
     svr->Post("/v1/embeddings", handle_embeddings);
     svr->Post("/tokenize",      handle_tokenize);
     svr->Post("/detokenize",    handle_detokenize);
+    if (!sparams.slot_save_path.empty()) {
+        // only enable slot endpoints if slot_save_path is set
+        svr->Post("/slots/:id_slot", handle_slots_action);
+    }
 
     //
     // Start the server
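The new `/slots/:id_slot` route multiplexes save, restore and erase through an `action` query parameter, and is only registered when the server was started with `--slot-save-path`. A hedged client-side sketch of the three calls (host, slot ids and the `slot1.bin` filename are examples, and the calls are assumed to run inside an async function):

// Hedged sketch of the new slot endpoints; values are placeholders.
const base = 'http://localhost:8080';

// Save slot 1's KV cache to <slot-save-path>/slot1.bin
await fetch(`${base}/slots/1?action=save`, {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({filename: 'slot1.bin'}),
});

// Restore that state into slot 0
await fetch(`${base}/slots/0?action=restore`, {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({filename: 'slot1.bin'}),
});

// Erase slot 1's cached tokens
await fetch(`${base}/slots/1?action=erase`, {method: 'POST'});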
 58  examples/server/tests/features/slotsave.feature  (new file)

@@ -0,0 +1,58 @@
+@llama.cpp
+@slotsave
+Feature: llama.cpp server slot management
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And prompt caching is enabled
+    And 2 slots
+    And . as slot save path
+    And 2048 KV cache size
+    And 42 as server seed
+    And 24 max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+
+  Scenario: Save and Restore Slot
+    # First prompt in slot 1 should be fully processed
+    Given a user prompt "What is the capital of France?"
+    And using slot id 1
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Lily|cake)
+    And 22 prompt tokens are processed
+    When the slot 1 is saved with filename "slot1.bin"
+    Then the server responds with status code 200
+    # Since we have cache, this should only process the last tokens
+    Given a user prompt "What is the capital of Germany?"
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Thank|special)
+    And 7 prompt tokens are processed
+    # Loading the original cache into slot 0,
+    # we should only be processing 1 prompt token and get the same output
+    When the slot 0 is restored with filename "slot1.bin"
+    Then the server responds with status code 200
+    Given a user prompt "What is the capital of France?"
+    And using slot id 0
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Lily|cake)
+    And 1 prompt tokens are processed
+    # For verification that slot 1 was not corrupted during slot 0 load, same thing
+    Given a user prompt "What is the capital of Germany?"
+    And using slot id 1
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Thank|special)
+    And 1 prompt tokens are processed
+
+  Scenario: Erase Slot
+    Given a user prompt "What is the capital of France?"
+    And using slot id 1
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Lily|cake)
+    And 22 prompt tokens are processed
+    When the slot 1 is erased
+    Then the server responds with status code 200
+    Given a user prompt "What is the capital of France?"
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Lily|cake)
+    And 22 prompt tokens are processed
@@ -49,6 +49,9 @@ def step_server_config(context, server_fqdn, server_port):
     context.n_predict = None
     context.n_prompts = 0
     context.n_server_predict = None
+    context.slot_save_path = None
+    context.id_slot = None
+    context.cache_prompt = None
     context.n_slots = None
     context.prompt_prefix = None
     context.prompt_suffix = None
@@ -119,6 +122,21 @@ def step_server_n_predict(context, n_predict):
     context.n_server_predict = n_predict
 
 
+@step('{slot_save_path} as slot save path')
+def step_slot_save_path(context, slot_save_path):
+    context.slot_save_path = slot_save_path
+
+
+@step('using slot id {id_slot:d}')
+def step_id_slot(context, id_slot):
+    context.id_slot = id_slot
+
+
+@step('prompt caching is enabled')
+def step_enable_prompt_cache(context):
+    context.cache_prompt = True
+
+
 @step('continuous batching')
 def step_server_continuous_batching(context):
     context.server_continuous_batching = True
@@ -212,6 +230,8 @@ async def step_request_completion(context, api_error):
                                             context.base_url,
                                             debug=context.debug,
                                             n_predict=context.n_predict,
+                                            cache_prompt=context.cache_prompt,
+                                            id_slot=context.id_slot,
                                             seed=await completions_seed(context),
                                             expect_api_error=expect_api_error,
                                             user_api_key=context.user_api_key)
@@ -711,12 +731,48 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
     await asyncio.sleep(0.1)
 
 
+@step('the slot {slot_id:d} is saved with filename "{filename}"')
+@async_run_until_complete
+async def step_save_slot(context, slot_id, filename):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
+                                json={"filename": filename},
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the slot {slot_id:d} is restored with filename "{filename}"')
+@async_run_until_complete
+async def step_restore_slot(context, slot_id, filename):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
+                                json={"filename": filename},
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the slot {slot_id:d} is erased')
+@async_run_until_complete
+async def step_erase_slot(context, slot_id):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the server responds with status code {status_code:d}')
+def step_server_responds_with_status_code(context, status_code):
+    assert context.response.status == status_code
+
+
 async def request_completion(prompt,
                              base_url,
                              debug=False,
                              prompt_prefix=None,
                              prompt_suffix=None,
                              n_predict=None,
+                             cache_prompt=False,
+                             id_slot=None,
                              seed=None,
                              expect_api_error=None,
                              user_api_key=None):
@@ -738,6 +794,8 @@ async def request_completion(prompt,
                                     "prompt": prompt,
                                     "input_suffix": prompt_suffix,
                                     "n_predict": n_predict if n_predict is not None else -1,
+                                    "cache_prompt": cache_prompt,
+                                    "id_slot": id_slot,
                                     "seed": seed if seed is not None else 42
                                 },
                                 headers=headers,
@@ -1104,6 +1162,8 @@ def start_server_background(context):
         server_args.extend(['--parallel', context.n_slots])
     if context.n_server_predict:
         server_args.extend(['--n-predict', context.n_server_predict])
+    if context.slot_save_path:
+        server_args.extend(['--slot-save-path', context.slot_save_path])
    if context.server_api_key:
        server_args.extend(['--api-key', context.server_api_key])
    if context.n_ga:
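With `cache_prompt` and `id_slot` added to the request payload, a completion issued by the test harness pins itself to a slot and opts into prompt caching explicitly. The body it sends has roughly this shape (values below are examples only):

// Example request body shape; prompt, counts and ids are placeholders.
const body = {
    prompt: 'What is the capital of Germany?',
    input_suffix: null,
    n_predict: 24,
    cache_prompt: true,   // reuse the slot's cached prompt tokens
    id_slot: 1,           // pin the request to slot 1
    seed: 42,
};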
@@ -567,6 +567,15 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
             {"model",  modelname},
             {"object", "chat.completion.chunk"}
         };
+        if (!finish_reason.empty()) {
+            int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+            int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+            ret.push_back({"usage", json {
+                {"completion_tokens", num_tokens_predicted},
+                {"prompt_tokens",     num_prompt_tokens},
+                {"total_tokens",      num_tokens_predicted + num_prompt_tokens}
+            }});
+        }
 
         return std::vector<json>({ret});
     }
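With this change, the last streamed chunk of an OpenAI-compatible completion also reports token counts, which is what lets the SSE-based benchmark above compute its rates. A rough sketch of that final chunk (ids and numbers are invented):

// Rough shape of the final streamed chunk; every value here is made up.
const finalChunk = {
    choices: [{finish_reason: 'stop', index: 0, delta: {}}],
    created: 1712345678,
    id: 'chatcmpl-0',
    model: 'model',
    object: 'chat.completion.chunk',
    usage: {completion_tokens: 24, prompt_tokens: 22, total_tokens: 46},
};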
@@ -76,6 +76,28 @@ int main(int argc, char ** argv) {
     params.n_threads_batch = params.n_threads_batch_draft;
     std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
 
+    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
+    LOG("vocab_type tgt: %d\n", vocab_type_tgt);
+
+    const bool vocab_type_dft = llama_vocab_type(model_dft);
+    LOG("vocab_type dft: %d\n", vocab_type_dft);
+
+    if (vocab_type_tgt != vocab_type_dft) {
+        fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
+        fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+        return 1;
+    }
+
+    if (
+        llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
+        llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
+        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
+    ) {
+        fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
+        return 1;
+    }
+
     {
         const int n_vocab_tgt = llama_n_vocab(model_tgt);
         const int n_vocab_dft = llama_n_vocab(model_dft);
@@ -105,20 +127,8 @@ int main(int argc, char ** argv) {
 
 
     // Tokenize the prompt
-    const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
-    LOG("add_bos tgt: %d\n", add_bos_tgt);
-
-    const bool add_bos_dft = llama_should_add_bos_token(model_dft);
-    LOG("add_bos dft: %d\n", add_bos_dft);
-
-    if (add_bos_tgt != add_bos_dft) {
-        fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
-        fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
-        return 1;
-    }
-
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
 
     const int max_context_size = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;
@@ -20,4 +20,4 @@ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 #cmake --build . --config Release --target llama-bench
 
 #build all binary
-cmake --build . --config Release -v
+cmake --build . --config Release -j -v

@@ -12,6 +12,7 @@ if [ $# -gt 0 ]; then
     GGML_SYCL_SINGLE_GPU=1
 else
     GGML_SYCL_DEVICE=0
+    GGML_SYCL_SINGLE_GPU=0
 fi
 
 #export GGML_SYCL_DEBUG=1
@@ -26,11 +26,9 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_default_params();
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
-    const bool add_bos = llama_should_add_bos_token(model);
-
     std::vector<llama_token> tokens;
 
-    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+    tokens = ::llama_tokenize(model, prompt, true, true);
 
     for (int i = 0; i < (int) tokens.size(); i++) {
         if (printing_ids) {
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 # ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
-# python examples/json-schema-to-grammar.py https://json.schemastore.org/tsconfig.json
+# python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json
 #
 set -euo pipefail
 
@@ -25,4 +25,4 @@ npx ts-json-schema-generator --unstable --no-top-ref --path "$DTS_FILE" --type M
 # https://github.com/YousefED/typescript-json-schema
 # npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2
 
-./examples/json-schema-to-grammar.py "$SCHEMA_FILE"
+./examples/json_schema_to_grammar.py "$SCHEMA_FILE"
 18  flake.lock  (generated)

@@ -5,11 +5,11 @@
       "nixpkgs-lib": "nixpkgs-lib"
     },
     "locked": {
-      "lastModified": 1709336216,
-      "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
+      "lastModified": 1712014858,
+      "narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
       "owner": "hercules-ci",
       "repo": "flake-parts",
-      "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
+      "rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
       "type": "github"
     },
     "original": {
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
     "locked": {
-      "lastModified": 1711703276,
-      "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
+      "lastModified": 1712791164,
+      "narHash": "sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw=",
       "owner": "NixOS",
       "repo": "nixpkgs",
-      "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
+      "rev": "1042fd8b148a9105f3c0aca3a6177fd1d9360ba5",
       "type": "github"
     },
     "original": {
@@ -37,11 +37,11 @@
     "nixpkgs-lib": {
     "locked": {
       "dir": "lib",
-      "lastModified": 1709237383,
-      "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
+      "lastModified": 1711703276,
+      "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
       "owner": "NixOS",
       "repo": "nixpkgs",
-      "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
+      "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
       "type": "github"
     },
     "original": {
@@ -137,7 +137,7 @@ extern "C" {
     /*
       Example usage:
 
-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
         // preferrably to run on the same backend as the buffer
         ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
12
ggml-cuda.cu
12
ggml-cuda.cu
|
@ -1226,7 +1226,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||||
|
|
||||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||||
// ldc == nrows of the matrix that cuBLAS writes into
|
// ldc == nrows of the matrix that cuBLAS writes into
|
||||||
int ldc = id == ctx.device ? ne0 : row_diff;
|
int64_t ldc = id == ctx.device ? ne0 : row_diff;
|
||||||
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
|
@ -1378,8 +1378,8 @@ static void ggml_cuda_op_mul_mat(
|
||||||
const int64_t ne0 = dst->ne[0];
|
const int64_t ne0 = dst->ne[0];
|
||||||
const int64_t ne1 = dst->ne[1];
|
const int64_t ne1 = dst->ne[1];
|
||||||
|
|
||||||
const int nb2 = dst->nb[2];
|
const int64_t nb2 = dst->nb[2];
|
||||||
const int nb3 = dst->nb[3];
|
const int64_t nb3 = dst->nb[3];
|
||||||
|
|
||||||
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
|
||||||
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
|
||||||
|
@ -1947,7 +1947,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||||
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||||
// KQV single-batch
|
// KQV single-batch
|
||||||
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
|
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
|
||||||
} else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||||
// KQ + KQV multi-batch
|
// KQ + KQV multi-batch
|
||||||
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
||||||
} else if (use_dequantize_mul_mat_vec) {
|
} else if (use_dequantize_mul_mat_vec) {
|
||||||
|
@@ -2622,6 +2622,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
        return false;
    }

+#if CUDART_VERSION >= 11100
    cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
    if (err != cudaSuccess) {
        // clear the error
@@ -2632,6 +2633,9 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
        return false;
    }
    return true;
+#else
+    return false;
+#endif
}

GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
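The hunks above gate read-only host registration behind CUDART_VERSION >= 11100, since cudaHostRegisterReadOnly first appeared in CUDA 11.1. A standalone sketch of that pattern (the pin_host_readonly name and the error reporting are illustrative, not part of the diff):

#include <cstdio>
#include <cuda_runtime.h>

// Pin an existing host allocation so the GPU can read it directly.
// cudaHostRegisterReadOnly requires CUDA 11.1 (CUDART_VERSION 11100); on older
// toolkits we report failure and the caller falls back to pageable memory.
static bool pin_host_readonly(void * buf, size_t size) {
#if CUDART_VERSION >= 11100
    const cudaError_t err = cudaHostRegister(buf, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaHostRegister failed: %s\n", cudaGetErrorString(err));
        cudaGetLastError(); // clear the sticky error so later CUDA calls are unaffected
        return false;
    }
    return true;
#else
    (void) buf; (void) size;
    return false;
#endif
}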
@@ -393,7 +393,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
// TODO: move to ggml-common.h
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

//////////////////////
@@ -4,14 +4,14 @@
#define CUDA_Q8_0_NE_ALIGN 2048

template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);

    if (i >= k) {
        return;
    }

-    const int ib = i/qk; // block index
+    const int64_t ib = i/qk; // block index
    const int iqs = (i%qk)/qr; // quant index
    const int iybs = i - i%qk; // y block start index
    const int y_offset = qr == 1 ? 1 : qk/2;
@@ -25,7 +25,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
}

template <bool need_check>
-static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
#if __CUDA_ARCH__ >= CC_PASCAL
    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

@@ -68,13 +68,13 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
template<typename dst_t>
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;

    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
-    const int ib = 8*i + ir;
+    const int64_t ib = 8*i + ir;
    if (ib >= nb32) {
        return;
    }
@@ -96,13 +96,13 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
template<typename dst_t>
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;

    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
-    const int ib = 8*i + ir;
+    const int64_t ib = 8*i + ir;
    if (ib >= nb32) {
        return;
    }
@@ -313,14 +313,14 @@ template<typename dst_t>
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q6_K * x = (const block_q6_K *) vx;

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
#if QK_K == 256

    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int ip  = tid/32;   // ip is 0 or 1
-    const int il  = tid - 32*ip; // 0...32
-    const int is  = 8*ip + il/16;
+    const int64_t tid = threadIdx.x;
+    const int64_t ip  = tid/32;   // ip is 0 or 1
+    const int64_t il  = tid - 32*ip; // 0...32
+    const int64_t is  = 8*ip + il/16;

    dst_t * y = yy + i*QK_K + 128*ip + il;

@@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
#else

    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
+    const int64_t tid = threadIdx.x;
+    const int64_t ip  = tid/16;         // 0 or 1
+    const int64_t il  = tid - 16*ip;    // 0...15

    dst_t * y = yy + i*QK_K + 16*ip + il;

@@ -571,12 +571,12 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
#endif

template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

-static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
        const bool need_check = false;
@@ -588,7 +588,7 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
}

template<typename dst_t>
-static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -598,7 +598,7 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cu
}

template<typename dst_t>
-static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -608,27 +608,27 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
}

template<typename dst_t>
-static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb32 = k / 32;
    const int nb = (k + 255) / 256;
    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
}

template<typename dst_t>
-static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb32 = k / 32;
    const int nb = (k + 255) / 256;
    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
}

template<typename dst_t>
-static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
-static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -638,7 +638,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
}

template<typename dst_t>
-static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -648,55 +648,55 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
}

template<typename dst_t>
-static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
-static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
-static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
-static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
-static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
-static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
-static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
-static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
-static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
#if QK_K == 64
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
@@ -706,8 +706,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
}

template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
@@ -719,7 +719,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
}

template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}
@@ -3,7 +3,7 @@
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256

template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);

typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half>  to_fp16_cuda_t;
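The recurring change in these kernels is widening element counts and flat indices from int to int64_t. A minimal, self-contained sketch of the failure mode being addressed (the kernel below is illustrative and not taken from the diff):

#include <cuda_runtime.h>
#include <cstdint>

// With 32-bit arithmetic, blockDim.x*blockIdx.x + threadIdx.x wraps around once a
// tensor holds more than ~2^31 elements, producing negative (out-of-bounds) indices.
// Promoting the index and the element count to int64_t, as the diff does, keeps the
// addressing valid for very large tensors.
__global__ void scale_f32(const float * x, float * y, const float s, const int64_t k) {
    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; // 64-bit flat index
    if (i >= k) {
        return;
    }
    y[i] = s*x[i];
}

static void scale_f32_cuda(const float * x, float * y, const float s, const int64_t k, cudaStream_t stream) {
    const int64_t num_blocks = (k + 255) / 256;
    scale_f32<<<(unsigned int) num_blocks, 256, 0, stream>>>(x, y, s, k);
}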
@@ -1,6 +1,6 @@
#include "common.cuh"

-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q4_0 * x = (const block_q4_0 *) vx;

    const dfloat d = x[ib].d;
@@ -19,7 +19,7 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
#endif // GGML_CUDA_F16
}

-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q4_1 * x = (const block_q4_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
@@ -39,7 +39,7 @@ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const in
#endif // GGML_CUDA_F16
}

-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q5_0 * x = (const block_q5_0 *) vx;

    const dfloat d = x[ib].d;
@@ -62,7 +62,7 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
#endif // GGML_CUDA_F16
}

-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q5_1 * x = (const block_q5_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
@@ -86,7 +86,7 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
#endif // GGML_CUDA_F16
}

-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q8_0 * x = (const block_q8_0 *) vx;

    const dfloat d = x[ib].d;
@@ -565,7 +565,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
    }
}

-static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const half * x = (const half *) vx;

    // automatic half -> float type cast if dfloat == float
@@ -577,7 +577,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
    // qk = quantized weights per x block
    // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
@@ -598,7 +598,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons

    for (int i = 0; i < ncols; i += iter_stride) {
        const int col = i + vals_per_iter*tid;
-        const int ib = (row*ncols + col)/qk; // x block index
+        const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
        const int iqs = (col%qk)/qr; // x quant index
        const int iybs = col - col%qk; // y block start index
@@ -1,20 +1,20 @@
#include "quantize.cuh"

-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
-    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
+    const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

    if (ix >= kx_padded) {
        return;
    }

-    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+    const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;

-    const int i_padded = iy*kx_padded + ix;
+    const int64_t i_padded = (int64_t)iy*kx_padded + ix;

    block_q8_1 * y = (block_q8_1 *) vy;

-    const int ib = i_padded / QK8_1; // block index
-    const int iqs = i_padded % QK8_1; // quant index
+    const int64_t ib = i_padded / QK8_1; // block index
+    const int64_t iqs = i_padded % QK8_1; // quant index

    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
    float amax = fabsf(xi);
@@ -36,8 +36,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
    reinterpret_cast<half&>(y[ib].ds.y) = sum;
}

-void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
-    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
+    const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, ky, 1);
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
@@ -2,4 +2,4 @@

#define CUDA_QUANTIZE_BLOCK_SIZE 256

-void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream);
+void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
@@ -88,7 +88,7 @@ typedef uint16_t ggml_fp16_internal_t;
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
84 ggml-metal.m
@@ -37,11 +37,15 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_DIV_ROW,
    GGML_METAL_KERNEL_TYPE_SCALE,
    GGML_METAL_KERNEL_TYPE_SCALE_4,
+   GGML_METAL_KERNEL_TYPE_CLAMP,
    GGML_METAL_KERNEL_TYPE_TANH,
    GGML_METAL_KERNEL_TYPE_RELU,
    GGML_METAL_KERNEL_TYPE_GELU,
+   GGML_METAL_KERNEL_TYPE_GELU_4,
    GGML_METAL_KERNEL_TYPE_GELU_QUICK,
+   GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
    GGML_METAL_KERNEL_TYPE_SILU,
+   GGML_METAL_KERNEL_TYPE_SILU_4,
    GGML_METAL_KERNEL_TYPE_SOFT_MAX,
    GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,
    GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,
@@ -477,11 +481,15 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true);
@@ -728,6 +736,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SCALE:
+       case GGML_OP_CLAMP:
        case GGML_OP_SQR:
        case GGML_OP_SUM_ROWS:
            return true;
@@ -1168,10 +1177,32 @@ static enum ggml_status ggml_metal_graph_compute(
                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                [encoder setBytes:&scale length:sizeof(scale) atIndex:2];

+               [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+           } break;
+       case GGML_OP_CLAMP:
+           {
+               id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;
+
+               float min;
+               float max;
+               memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
+               memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
+
+               [encoder setComputePipelineState:pipeline];
+               [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+               [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+               [encoder setBytes:&min length:sizeof(min) atIndex:2];
+               [encoder setBytes:&max length:sizeof(max) atIndex:3];
+
+               const int64_t n = ggml_nelements(dst);
+
                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(gf->nodes[i])) {
+               // we are not taking into account the strides, so for now require contiguous tensors
+               GGML_ASSERT(ggml_is_contiguous(src0));
+
                case GGML_UNARY_OP_TANH:
                    {
                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
@@ -1198,42 +1229,60 @@ static enum ggml_status ggml_metal_graph_compute(
                    } break;
                case GGML_UNARY_OP_GELU:
                    {
-                       id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
+                       int64_t n = ggml_nelements(dst);
+
+                       id<MTLComputePipelineState> pipeline = nil;
+
+                       if (n % 4 == 0) {
+                           pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline;
+                           n /= 4;
+                       } else {
+                           pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
+                       }

                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

-                       const int64_t n = ggml_nelements(dst);
-                       GGML_ASSERT(n % 4 == 0);
-
-                       [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                       [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_UNARY_OP_GELU_QUICK:
                    {
-                       id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
+                       int64_t n = ggml_nelements(dst);
+
+                       id<MTLComputePipelineState> pipeline = nil;
+
+                       if (n % 4 == 0) {
+                           pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline;
+                           n /= 4;
+                       } else {
+                           pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
+                       }

                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

-                       const int64_t n = ggml_nelements(dst);
-                       GGML_ASSERT(n % 4 == 0);
-
-                       [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                       [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_UNARY_OP_SILU:
                    {
-                       id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
+                       int64_t n = ggml_nelements(dst);
+
+                       id<MTLComputePipelineState> pipeline = nil;
+
+                       if (n % 4 == 0) {
+                           pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline;
+                           n /= 4;
+                       } else {
+                           pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
+                       }

                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

-                       const int64_t n = ggml_nelements(dst);
-                       GGML_ASSERT(n % 4 == 0);
-
-                       [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                       [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                default:
                    {
@@ -1944,7 +1993,12 @@ static enum ggml_status ggml_metal_graph_compute(
                            {
                                nth0 = 4;
                                nth1 = 16;
+#if QK_K == 64
+                               pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
+#else
                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
+#endif
+
                            } break;
                        default:
                            {
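The GELU, GELU_QUICK and SILU cases above now pick a kernel that processes four elements per thread whenever ggml_nelements(dst) is divisible by 4, and otherwise fall back to the scalar kernel instead of asserting divisibility. For readers more at home in CUDA than Metal, a rough sketch of the same dispatch pattern (gelu_f32 and gelu_f32x4 are hypothetical names, not kernels from this commit):

#include <cuda_runtime.h>

// tanh approximation of GELU, used only to make the sketch self-contained
static __device__ __forceinline__ float gelu(float x) {
    return 0.5f*x*(1.0f + tanhf(0.79788456f*(x + 0.044715f*x*x*x)));
}

__global__ void gelu_f32(const float * x, float * y, const int64_t n) {
    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
    if (i < n) {
        y[i] = gelu(x[i]);
    }
}

__global__ void gelu_f32x4(const float4 * x, float4 * y, const int64_t n4) {
    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
    if (i < n4) {
        const float4 v = x[i];
        y[i] = make_float4(gelu(v.x), gelu(v.y), gelu(v.z), gelu(v.w));
    }
}

// Host-side dispatch mirroring the Metal change: use the vectorized kernel only
// when n % 4 == 0, otherwise fall back to the scalar one instead of asserting.
static void gelu_cuda(const float * x, float * y, const int64_t n, cudaStream_t stream) {
    const int block = 256;
    if (n % 4 == 0) {
        const int64_t n4 = n/4; // four floats handled per thread
        gelu_f32x4<<<(unsigned int)((n4 + block - 1)/block), block, 0, stream>>>((const float4 *) x, (float4 *) y, n4);
    } else {
        gelu_f32<<<(unsigned int)((n + block - 1)/block), block, 0, stream>>>(x, y, n);
    }
}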
1358 ggml-metal.metal (file diff suppressed because it is too large)
312 ggml-quants.c (file diff suppressed because it is too large)
148 ggml-quants.h
@@ -12,70 +12,70 @@ extern "C" {
#endif

// Quantization
-void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
+void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
+void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
+void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
+void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
+void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
+void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);

-void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
+void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
+void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
+void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
+void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
+void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
+void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);

-void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
+void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
+void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
+void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
+void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
+void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);

-void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

-void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

-void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

// Dequantization
-void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

-void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

-void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -101,26 +101,26 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

-size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
1171 ggml-sycl.cpp (file diff suppressed because it is too large)
133 ggml.c
@@ -4,6 +4,7 @@
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
+#include "sgemm.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
#include <unistd.h>
#endif

+#ifdef __ARM_FEATURE_MATMUL_INT8
+#undef GGML_USE_LLAMAFILE
+#endif
+
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
@@ -338,14 +343,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
    return GGML_FP32_TO_FP16(x);
}

-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
-    for (int i = 0; i < n; i++) {
+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
        y[i] = GGML_FP16_TO_FP32(x[i]);
    }
}

-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
-    int i = 0;
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
#if defined(__F16C__)
    for (; i + 7 < n; i += 8) {
        __m256 x_vec = _mm256_loadu_ps(x + i);
@@ -10928,6 +10933,28 @@ static void ggml_compute_forward_mul_mat(
    }
#endif

+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type)) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
    if (params->type == GGML_TASK_TYPE_INIT) {
        if (ith != 0) {
            return;
@@ -10959,6 +10986,30 @@ static void ggml_compute_forward_mul_mat(
    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
    const size_t row_size = ggml_row_size(vec_dot_type, ne10);

+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + ggml_row_size(vec_dot_type,
+                                         nb12/ggml_type_size(src1->type)*i12 +
+                                         nb13/ggml_type_size(src1->type)*i13),
|
||||||
|
row_size/ggml_type_size(vec_dot_type),
|
||||||
|
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||||
|
nb1/ggml_type_size(dst->type),
|
||||||
|
ith, nth,
|
||||||
|
params->type,
|
||||||
|
src0->type,
|
||||||
|
vec_dot_type,
|
||||||
|
dst->type))
|
||||||
|
goto UseGgmlGemm2;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
UseGgmlGemm2:;
|
||||||
|
#endif
|
||||||
|
|
||||||
const int64_t nr0 = ne01; // src0 rows
|
const int64_t nr0 = ne01; // src0 rows
|
||||||
const int64_t nr1 = ne1*ne12*ne13; // src1 rows
|
const int64_t nr1 = ne1*ne12*ne13; // src1 rows
|
||||||
|
|
||||||
|
@ -11130,7 +11181,6 @@ static void ggml_compute_forward_mul_mat_id(
|
||||||
}
|
}
|
||||||
|
|
||||||
// initialize matrix_row_counts
|
// initialize matrix_row_counts
|
||||||
GGML_ASSERT(wdata == wdata_src1_end);
|
|
||||||
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
|
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
|
||||||
|
|
||||||
// group rows by src0 matrix
|
// group rows by src0 matrix
|
||||||
|
@ -20654,11 +20704,11 @@ size_t ggml_quantize_chunk(
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
const float * src,
|
const float * src,
|
||||||
void * dst,
|
void * dst,
|
||||||
int start,
|
int64_t start,
|
||||||
int nrows,
|
int64_t nrows,
|
||||||
int n_per_row,
|
int64_t n_per_row,
|
||||||
const float * imatrix) {
|
const float * imatrix) {
|
||||||
const int n = nrows * n_per_row;
|
const int64_t n = (int64_t) nrows * n_per_row;
|
||||||
|
|
||||||
if (ggml_quantize_requires_imatrix(type)) {
|
if (ggml_quantize_requires_imatrix(type)) {
|
||||||
GGML_ASSERT(imatrix != NULL);
|
GGML_ASSERT(imatrix != NULL);
|
||||||
|
@ -20873,6 +20923,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||||
return ok;
|
return ok;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void gguf_free_kv(struct gguf_kv * kv) {
|
||||||
|
if (kv->key.data) {
|
||||||
|
GGML_FREE(kv->key.data);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (kv->type == GGUF_TYPE_STRING) {
|
||||||
|
if (kv->value.str.data) {
|
||||||
|
GGML_FREE(kv->value.str.data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (kv->type == GGUF_TYPE_ARRAY) {
|
||||||
|
if (kv->value.arr.data) {
|
||||||
|
if (kv->value.arr.type == GGUF_TYPE_STRING) {
|
||||||
|
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
||||||
|
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
|
||||||
|
if (str->data) {
|
||||||
|
GGML_FREE(str->data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
GGML_FREE(kv->value.arr.data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct gguf_context * gguf_init_empty(void) {
|
struct gguf_context * gguf_init_empty(void) {
|
||||||
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
||||||
|
|
||||||
|
@ -21222,31 +21298,7 @@ void gguf_free(struct gguf_context * ctx) {
|
||||||
if (ctx->kv) {
|
if (ctx->kv) {
|
||||||
// free string memory - not great..
|
// free string memory - not great..
|
||||||
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||||
struct gguf_kv * kv = &ctx->kv[i];
|
gguf_free_kv(&ctx->kv[i]);
|
||||||
|
|
||||||
if (kv->key.data) {
|
|
||||||
GGML_FREE(kv->key.data);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (kv->type == GGUF_TYPE_STRING) {
|
|
||||||
if (kv->value.str.data) {
|
|
||||||
GGML_FREE(kv->value.str.data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (kv->type == GGUF_TYPE_ARRAY) {
|
|
||||||
if (kv->value.arr.data) {
|
|
||||||
if (kv->value.arr.type == GGUF_TYPE_STRING) {
|
|
||||||
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
|
||||||
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
|
|
||||||
if (str->data) {
|
|
||||||
GGML_FREE(str->data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
GGML_FREE(kv->value.arr.data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_FREE(ctx->kv);
|
GGML_FREE(ctx->kv);
|
||||||
|
@ -21471,6 +21523,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
|
||||||
return n_kv;
|
return n_kv;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void gguf_remove_key(struct gguf_context * ctx, const char * key) {
|
||||||
|
const int idx = gguf_find_key(ctx, key);
|
||||||
|
if (idx >= 0) {
|
||||||
|
const int n_kv = gguf_get_n_kv(ctx);
|
||||||
|
gguf_free_kv(&ctx->kv[idx]);
|
||||||
|
for (int i = idx; i < n_kv-1; ++i) {
|
||||||
|
ctx->kv[i] = ctx->kv[i+1];
|
||||||
|
}
|
||||||
|
ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
|
||||||
|
ctx->header.n_kv--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
|
void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
|
||||||
const int idx = gguf_get_or_add_key(ctx, key);
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
|
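The new `gguf_remove_key()` frees the key/value pair in place and compacts the array, so it is safe to call whether or not the key exists. A small usage sketch follows; the key name is only an example taken from the tokenizer constants elsewhere in this change, and `gguf_find_key()` is the existing lookup that returns -1 when a key is absent.

```c
#include <assert.h>
#include "ggml.h"

// Drop a metadata key from an in-memory GGUF context.
// Assumes `ctx` was obtained earlier, e.g. from gguf_init_from_file().
static void strip_chat_template(struct gguf_context * ctx) {
    gguf_remove_key(ctx, "tokenizer.chat_template"); // no-op if the key is absent
    assert(gguf_find_key(ctx, "tokenizer.chat_template") == -1);
}
```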
ggml.h (17 changes)

@@ -332,8 +332,8 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
 
     struct ggml_object;
     struct ggml_context;
@@ -2230,9 +2230,9 @@ extern "C" {
             enum ggml_type type,
             const float * src,
             void * dst,
-            int start,
-            int nrows,
-            int n_per_row,
+            int64_t start,
+            int64_t nrows,
+            int64_t n_per_row,
             const float * imatrix);
 
     //
@@ -2309,6 +2309,9 @@ extern "C" {
     GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
     GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
 
+    // removes key if it exists
+    GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
     // overrides existing values or adds a new one
     GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t val);
     GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t val);
@@ -2397,8 +2400,8 @@ extern "C" {
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*ggml_to_float_t)  (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+    typedef void (*ggml_to_float_t)  (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                       const void * GGML_RESTRICT y, size_t by, int nrc);
 
@@ -24,6 +24,7 @@ class Keys:
         ALIGNMENT    = "general.alignment"
         NAME         = "general.name"
         AUTHOR       = "general.author"
+        VERSION      = "general.version"
         URL          = "general.url"
         DESCRIPTION  = "general.description"
         LICENSE      = "general.license"
@@ -89,6 +90,11 @@ class Keys:
         HF_JSON       = "tokenizer.huggingface.json"
         RWKV          = "tokenizer.rwkv.world"
         CHAT_TEMPLATE = "tokenizer.chat_template"
+        # FIM/Infill special tokens constants
+        PREFIX_ID     = "tokenizer.ggml.prefix_token_id"
+        SUFFIX_ID     = "tokenizer.ggml.suffix_token_id"
+        MIDDLE_ID     = "tokenizer.ggml.middle_token_id"
+        EOT_ID        = "tokenizer.ggml.eot_token_id"
 
 
 #
@@ -114,6 +120,7 @@ class MODEL_ARCH(IntEnum):
     STABLELM  = auto()
     QWEN      = auto()
     QWEN2     = auto()
+    QWEN2MOE  = auto()
     PHI2      = auto()
     PLAMO     = auto()
     CODESHELL = auto()
@@ -125,6 +132,7 @@ class MODEL_ARCH(IntEnum):
     MAMBA     = auto()
    XVERSE    = auto()
     COMMAND_R = auto()
+    DBRX      = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -145,6 +153,7 @@ class MODEL_TENSOR(IntEnum):
     ATTN_OUT_NORM      = auto()
     ATTN_ROT_EMBD      = auto()
     FFN_GATE_INP       = auto()
+    FFN_GATE_INP_SHEXP = auto()
     FFN_NORM           = auto()
     FFN_GATE           = auto()
     FFN_DOWN           = auto()
@@ -153,6 +162,9 @@ class MODEL_TENSOR(IntEnum):
     FFN_GATE_EXP       = auto()
     FFN_DOWN_EXP       = auto()
     FFN_UP_EXP         = auto()
+    FFN_GATE_SHEXP     = auto()
+    FFN_DOWN_SHEXP     = auto()
+    FFN_UP_SHEXP       = auto()
     ATTN_Q_NORM        = auto()
     ATTN_K_NORM        = auto()
     LAYER_OUT_NORM     = auto()
@@ -183,6 +195,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.STABLELM:  "stablelm",
     MODEL_ARCH.QWEN:      "qwen",
     MODEL_ARCH.QWEN2:     "qwen2",
+    MODEL_ARCH.QWEN2MOE:  "qwen2moe",
     MODEL_ARCH.PHI2:      "phi2",
     MODEL_ARCH.PLAMO:     "plamo",
     MODEL_ARCH.CODESHELL: "codeshell",
@@ -194,6 +207,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.MAMBA:     "mamba",
     MODEL_ARCH.XVERSE:    "xverse",
     MODEL_ARCH.COMMAND_R: "command-r",
+    MODEL_ARCH.DBRX:      "dbrx",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -216,10 +230,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ATTN_K_NORM:        "blk.{bid}.attn_k_norm",
     MODEL_TENSOR.ATTN_OUT_NORM:      "blk.{bid}.attn_output_norm",
     MODEL_TENSOR.FFN_GATE_INP:       "blk.{bid}.ffn_gate_inp",
+    MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
     MODEL_TENSOR.FFN_NORM:           "blk.{bid}.ffn_norm",
     MODEL_TENSOR.FFN_GATE:           "blk.{bid}.ffn_gate",
     MODEL_TENSOR.FFN_DOWN:           "blk.{bid}.ffn_down",
     MODEL_TENSOR.FFN_UP:             "blk.{bid}.ffn_up",
+    MODEL_TENSOR.FFN_GATE_SHEXP:     "blk.{bid}.ffn_gate_shexp",
+    MODEL_TENSOR.FFN_DOWN_SHEXP:     "blk.{bid}.ffn_down_shexp",
+    MODEL_TENSOR.FFN_UP_SHEXP:       "blk.{bid}.ffn_up_shexp",
     MODEL_TENSOR.FFN_ACT:            "blk.{bid}.ffn",
     MODEL_TENSOR.FFN_GATE_EXP:       "blk.{bid}.ffn_gate_exps",
     MODEL_TENSOR.FFN_DOWN_EXP:       "blk.{bid}.ffn_down_exps",
@@ -437,6 +455,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
     ],
     MODEL_ARCH.QWEN: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -466,6 +486,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.QWEN2MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_INP_SHEXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
     MODEL_ARCH.PLAMO: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -638,6 +677,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+    ],
+    MODEL_ARCH.DBRX: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     ],
     # TODO
 }
@@ -867,3 +921,7 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
 KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV    = Keys.Tokenizer.RWKV
+KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
+KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
+KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
+KEY_TOKENIZER_EOT_ID    = Keys.Tokenizer.EOT_ID
@@ -296,6 +296,9 @@ class GGUFWriter:
     def add_author(self, author: str) -> None:
         self.add_string(Keys.General.AUTHOR, author)
 
+    def add_version(self, version: str) -> None:
+        self.add_string(Keys.General.VERSION, version)
+
     def add_tensor_data_layout(self, layout: str) -> None:
         self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
 
@@ -305,6 +308,9 @@ class GGUFWriter:
     def add_description(self, description: str) -> None:
         self.add_string(Keys.General.DESCRIPTION, description)
 
+    def add_licence(self, licence: str) -> None:
+        self.add_string(Keys.General.LICENSE, licence)
+
     def add_source_url(self, url: str) -> None:
         self.add_string(Keys.General.SOURCE_URL, url)
 
@@ -463,6 +469,18 @@ class GGUFWriter:
     def add_chat_template(self, value: str) -> None:
         self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
 
+    def add_prefix_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
+
+    def add_suffix_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
+
+    def add_middle_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
+
+    def add_eot_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOT_ID, id)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
@@ -10,7 +10,7 @@ class TensorNameMap:
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in",            # gptneox
-            "transformer.wte",              # gpt2 gpt-j mpt refact qwen
+            "transformer.wte",              # gpt2 gpt-j mpt refact qwen dbrx
             "transformer.word_embeddings",  # falcon
             "word_embeddings",              # bloom
             "model.embed_tokens",           # llama-hf
@@ -48,7 +48,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -60,7 +60,7 @@ class TensorNameMap:
             "transformer.ln_f",                        # gpt2 gpt-j falcon
             "model.norm",                              # llama-hf baichuan internlm2
             "norm",                                    # llama-pth
-            "transformer.norm_f",                      # mpt
+            "transformer.norm_f",                      # mpt dbrx
             "ln_f",                                    # refact bloom qwen gpt2
             "language_model.encoder.final_layernorm",  # persimmon
             "model.final_layernorm",                   # persimmon
@@ -96,6 +96,7 @@ class TensorNameMap:
             "model.layers.{bid}.norm",                          # mamba-qbert
             "backbone.layers.{bid}.norm",                       # mamba
             "transformer.decoder_layer.{bid}.rms_norm",         # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_1",   # dbrx
         ),
 
         # Attention norm 2
@@ -108,6 +109,7 @@ class TensorNameMap:
             "gpt_neox.layers.{bid}.attention.query_key_value",                     # gptneox
             "transformer.h.{bid}.attn.c_attn",                                     # gpt2 qwen
             "transformer.blocks.{bid}.attn.Wqkv",                                  # mpt
+            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",                   # dbrx
             "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
             "h.{bid}.self_attention.query_key_value",                              # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
@@ -168,7 +170,8 @@ class TensorNameMap:
             "model.layers.layers.{bid}.self_attn.o_proj",                   # plamo
             "model.layers.{bid}.attention.wo",                              # internlm2
             "encoder.layers.{bid}.attn.out_proj",                           # nomic-bert
-            "transformer.decoder_layer.{bid}.multi_head_attention.linear"   # Grok
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",        # dbrx
         ),
 
         # Attention output norm
@@ -176,6 +179,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
             "encoder.layers.{bid}.norm1",                      # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
         ),
 
         # Rotary embeddings
@@ -204,7 +208,13 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",             # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
-            "transformer.decoder_layer.{bid}.router"      # Grok
+            "model.layers.{bid}.mlp.gate",                # qwen2moe
+            "transformer.decoder_layer.{bid}.router",     # Grok
+            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
+        ),
+
+        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert_gate",  # qwen2moe
         ),
 
         # Feed-forward up
@@ -233,6 +243,12 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3",          # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",   # dbrx
+            "model.layers.{bid}.mlp.experts.up_proj",        # qwen2moe (merged)
+        ),
+
+        MODEL_TENSOR.FFN_UP_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert.up_proj",  # qwen2moe
         ),
 
         # AWQ-activation gate
@@ -252,7 +268,13 @@ class TensorNameMap:
 
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1",         # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear"    # Grok (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",   # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
+            "model.layers.{bid}.mlp.experts.gate_proj",     # qwen2moe (merged)
+        ),
+
+        MODEL_TENSOR.FFN_GATE_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert.gate_proj",  # qwen2moe
         ),
 
         # Feed-forward down
@@ -280,17 +302,25 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
+            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe (merged)
+        ),
+
+        MODEL_TENSOR.FFN_DOWN_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert.down_proj",  # qwen2moe
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",  # persimmon
+            "model.layers.{bid}.self_attn.q_norm",       # cohere
             "transformer.blocks.{bid}.attn.q_ln",        # sea-lion
         ),
 
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",  # persimmon
+            "model.layers.{bid}.self_attn.k_norm",       # cohere
             "transformer.blocks.{bid}.attn.k_ln",        # sea-lion
         ),
 
@@ -356,7 +386,7 @@ class TensorNameMap:
             if tensor not in MODEL_TENSORS[arch]:
                 continue
             # TODO: make this configurable
-            n_experts = 8
+            n_experts = 60
             for xid in range(n_experts):
                 tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
                 self.mapping[tensor_name] = (tensor, tensor_name)
@@ -89,3 +89,13 @@ This guide provides a brief overview. Check out the GBNF files in this directory
 ```
 ./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
 ```
+
+## Troubleshooting
+
+Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218).
+
+### Efficient optional repetitions
+
+A common pattern is to allow repetitions of a pattern `x` up to N times.
+
+While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) will result in extremely slow inference. Instead, you can write `(x (x (x ... (x)?...)?)?)?` (w/ N-deep nesting)
llama.h (94 changes)

@@ -37,10 +37,14 @@
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 5
 
+#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
+#define LLAMA_STATE_SEQ_VERSION 1
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -523,6 +527,7 @@
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
    // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
@@ -594,34 +599,92 @@
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+        "use llama_state_get_size instead");
 
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(
+    LLAMA_API size_t llama_state_get_data(
             struct llama_context * ctx,
                          uint8_t * dst);
+    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+            struct llama_context * ctx,
+                         uint8_t * dst),
+        "use llama_state_get_data instead");
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(
+    LLAMA_API size_t llama_state_set_data(
            struct llama_context * ctx,
                    const uint8_t * src);
+    LLAMA_API DEPRECATED(size_t llama_set_state_data(
+            struct llama_context * ctx,
+                   const uint8_t * src),
+        "use llama_state_set_data instead");
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(
+    LLAMA_API bool llama_state_load_file(
            struct llama_context * ctx,
                      const char * path_session,
                     llama_token * tokens_out,
                           size_t   n_token_capacity,
                           size_t * n_token_count_out);
+    LLAMA_API DEPRECATED(bool llama_load_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+                     llama_token * tokens_out,
+                           size_t   n_token_capacity,
+                           size_t * n_token_count_out),
+        "use llama_state_load_file instead");
 
-    LLAMA_API bool llama_save_session_file(
+    LLAMA_API bool llama_state_save_file(
            struct llama_context * ctx,
                      const char * path_session,
               const llama_token * tokens,
                           size_t   n_token_count);
+    LLAMA_API DEPRECATED(bool llama_save_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+               const llama_token * tokens,
+                           size_t   n_token_count),
+        "use llama_state_save_file instead");
+
+    // Get the exact size needed to copy the KV cache of a single sequence
+    LLAMA_API size_t llama_state_seq_get_size(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Copy the KV cache of a single sequence into the specified buffer
+    LLAMA_API size_t llama_state_seq_get_data(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                    llama_seq_id   seq_id);
+
+    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+    // Returns:
+    //  - Positive: Ok
+    //  - Zero: Failed to load
+    LLAMA_API size_t llama_state_seq_set_data(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                    llama_seq_id   dest_seq_id);
+
+    LLAMA_API size_t llama_state_seq_save_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   seq_id,
+               const llama_token * tokens,
+                          size_t   n_token_count);
+
+    LLAMA_API size_t llama_state_seq_load_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   dest_seq_id,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out);
 
     //
     // Decoding
@@ -684,8 +747,9 @@
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
-    // Logits for the ith token. Equivalent to:
+    // Logits for the ith token. For positive indices, Equivalent to:
     // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indicies can be used to access logits in reverse order, -1 is the last logit.
     // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
@@ -697,8 +761,9 @@
     // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token. Equivalent to:
+    // Get the embeddings for the ith token. For positive indices, Equivalent to:
     // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
     // shape: [n_embd] (1-dimensional)
     // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
@@ -721,6 +786,8 @@
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
+    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
     // Returns -1 if unknown, 1 for true or 0 for false.
@@ -743,16 +810,16 @@
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
-    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
-    ///                Does not insert a leading space.
+    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
+    ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                          int32_t   text_len,
                      llama_token * tokens,
                          int32_t   n_tokens_max,
-                            bool   add_bos,
-                            bool   special);
+                            bool   add_special,
+                            bool   parse_special);
 
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
@@ -1030,10 +1097,11 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
         struct llama_context * ctx
 );
 
-std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+void llama_grammar_accept(
         const std::vector<std::vector<llama_grammar_element>>         & rules,
         const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t chr);
+        const uint32_t chr,
+        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks);
 
 std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
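For the renamed session-state API above, here is a brief sketch of saving and restoring a whole-context session; `ctx`, the token buffer, and the "session.bin" path are placeholders, and the deprecated `llama_save_session_file()`/`llama_load_session_file()` wrappers remain available during the transition.

```c
#include "llama.h"

// Persist the current context state together with the prompt tokens.
static bool save_session(struct llama_context * ctx,
                         const llama_token * tokens, size_t n_tokens) {
    return llama_state_save_file(ctx, "session.bin", tokens, n_tokens);
}

// Restore it later; returns the number of tokens read back, or 0 on failure.
static size_t restore_session(struct llama_context * ctx,
                              llama_token * tokens, size_t n_capacity) {
    size_t n_loaded = 0;
    if (!llama_state_load_file(ctx, "session.bin", tokens, n_capacity, &n_loaded)) {
        return 0;
    }
    return n_loaded;
}
```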
scripts/gen-authors.sh (new executable file, 9 lines)

@@ -0,0 +1,9 @@
+#!/bin/bash
+
+printf "# date: $(date)\n" > AUTHORS
+printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS
+
+git log --format='%an <%ae>' --reverse --date=short master | awk '!seen[$0]++' | sort >> AUTHORS
+
+# if necessary, update your name here. for example: jdoe -> John Doe
+sed -i '' 's/^jdoe/John Doe/g' AUTHORS
@@ -1,10 +1,11 @@
 #!/bin/bash
 
 wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+unzip wikitext-2-raw-v1.zip
 
 echo "Usage:"
 echo ""
-echo "  ./perplexity -m model.gguf -f wiki.test.raw [other params]"
+echo "  ./perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]"
 echo ""
 
 exit 0
@@ -3,9 +3,9 @@
 # Shortcut for downloading HF models
 #
 # Usage:
-#   ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-#   ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-#   ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
 #
 
 # all logs go to stderr
@@ -14,7 +14,7 @@ function log {
 }
 
 function usage {
-    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [-h|--help]"
+    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [--outdir <dir> [-h|--help]"
     exit 1
 }
 
@@ -26,9 +26,9 @@ function has_cmd {
 }
 
 if has_cmd wget; then
-    cmd="wget -q --show-progress -c -O %s %s"
+    cmd="wget -q --show-progress -c -O %s/%s %s"
 elif has_cmd curl; then
-    cmd="curl -C - -f -o %s -L %s"
+    cmd="curl -C - -f --output-dir %s -o %s -L %s"
 else
     log "[E] curl or wget not found"
     exit 1
@@ -37,6 +37,7 @@ fi
 url=""
 repo=""
 file=""
+outdir="."
 
 # parse args
 while [[ $# -gt 0 ]]; do
@@ -53,6 +54,10 @@ while [[ $# -gt 0 ]]; do
             file="$2"
             shift 2
             ;;
+        --outdir)
+            outdir="$2"
+            shift 2
+            ;;
         -h|--help)
             usage
             ;;
@@ -94,10 +99,10 @@ basename=$(basename $url)
 log "[+] attempting to download $basename"
 
 if [ -n "$cmd" ]; then
-    cmd=$(printf "$cmd" "$basename" "$url")
+    cmd=$(printf "$cmd" "$outdir" "$basename" "$url")
     log "[+] $cmd"
     if $cmd; then
-        echo $basename
+        echo $outdir/$basename
         exit 0
     fi
 fi
Some files were not shown because too many files have changed in this diff.