Merge remote-tracking branch 'origin/master' into json-faster-repetitions2
This commit is contained in:
commit
958bdda559
55 changed files with 4509 additions and 3231 deletions
22
.github/workflows/build.yml
vendored
22
.github/workflows/build.yml
vendored
|
@ -52,7 +52,7 @@ jobs:
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest -L main --verbose --timeout 900
|
ctest -L 'main|curl' --verbose --timeout 900
|
||||||
|
|
||||||
- name: Determine tag name
|
- name: Determine tag name
|
||||||
id: tag
|
id: tag
|
||||||
|
@ -101,7 +101,9 @@ jobs:
|
||||||
sysctl -a
|
sysctl -a
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
|
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||||
|
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||||
|
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
|
@ -209,21 +211,21 @@ jobs:
|
||||||
id: depends
|
id: depends
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install build-essential
|
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. -DLLAMA_FATAL_WARNINGS=ON
|
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest -L main --verbose --timeout 900
|
ctest -L 'main|curl' --verbose --timeout 900
|
||||||
|
|
||||||
- name: Test llama2c conversion
|
- name: Test llama2c conversion
|
||||||
id: llama2c_test
|
id: llama2c_test
|
||||||
|
@ -938,6 +940,12 @@ jobs:
|
||||||
- name: Download artifacts
|
- name: Download artifacts
|
||||||
id: download-artifact
|
id: download-artifact
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v4
|
||||||
|
with:
|
||||||
|
path: ./artifact
|
||||||
|
|
||||||
|
- name: Move artifacts
|
||||||
|
id: move_artifacts
|
||||||
|
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
|
||||||
|
|
||||||
- name: Create release
|
- name: Create release
|
||||||
id: create_release
|
id: create_release
|
||||||
|
@ -956,7 +964,7 @@ jobs:
|
||||||
const path = require('path');
|
const path = require('path');
|
||||||
const fs = require('fs');
|
const fs = require('fs');
|
||||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||||
for (let file of await fs.readdirSync('./artifact')) {
|
for (let file of await fs.readdirSync('./artifact/release')) {
|
||||||
if (path.extname(file) === '.zip') {
|
if (path.extname(file) === '.zip') {
|
||||||
console.log('uploadReleaseAsset', file);
|
console.log('uploadReleaseAsset', file);
|
||||||
await github.repos.uploadReleaseAsset({
|
await github.repos.uploadReleaseAsset({
|
||||||
|
@ -964,7 +972,7 @@ jobs:
|
||||||
repo: context.repo.repo,
|
repo: context.repo.repo,
|
||||||
release_id: release_id,
|
release_id: release_id,
|
||||||
name: file,
|
name: file,
|
||||||
data: await fs.readFileSync(`./artifact/${file}`)
|
data: await fs.readFileSync(`./artifact/release/${file}`)
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -48,6 +48,7 @@ models-mnt
|
||||||
/convert-llama2c-to-ggml
|
/convert-llama2c-to-ggml
|
||||||
/embd-input-test
|
/embd-input-test
|
||||||
/embedding
|
/embedding
|
||||||
|
/eval-callback
|
||||||
/gguf
|
/gguf
|
||||||
/gguf-llama-simple
|
/gguf-llama-simple
|
||||||
/gguf-split
|
/gguf-split
|
||||||
|
|
655
AUTHORS
Normal file
655
AUTHORS
Normal file
|
@ -0,0 +1,655 @@
|
||||||
|
# date: Tue Apr 9 09:17:14 EEST 2024
|
||||||
|
# this file is auto-generated by scripts/gen-authors.sh
|
||||||
|
|
||||||
|
0cc4m <picard12@live.de>
|
||||||
|
0xspringtime <110655352+0xspringtime@users.noreply.github.com>
|
||||||
|
2f38b454 <dxf@protonmail.com>
|
||||||
|
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
|
||||||
|
44670 <44670@users.noreply.github.com>
|
||||||
|
AN Long <aisk@users.noreply.github.com>
|
||||||
|
AT <manyoso@users.noreply.github.com>
|
||||||
|
Aarni Koskela <akx@iki.fi>
|
||||||
|
Aaron Miller <apage43@ninjawhale.com>
|
||||||
|
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
|
||||||
|
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
|
||||||
|
Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
|
||||||
|
Adithya Balaji <adithya.b94@gmail.com>
|
||||||
|
AdithyanI <adithyan.i4internet@gmail.com>
|
||||||
|
Adrian <smith.adriane@gmail.com>
|
||||||
|
Adrian Hesketh <a-h@users.noreply.github.com>
|
||||||
|
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
|
||||||
|
Aisuko <urakiny@gmail.com>
|
||||||
|
Alberto <57916483+albbus-stack@users.noreply.github.com>
|
||||||
|
Alex <awhill19@icloud.com>
|
||||||
|
Alex Azarov <alex@azarov.by>
|
||||||
|
Alex Azarov <alexander.azarov@mapbox.com>
|
||||||
|
Alex Klinkhamer <from.github.com.917@grencez.dev>
|
||||||
|
Alex Klinkhamer <git@grencez.dev>
|
||||||
|
Alex Nguyen <tiendung@users.noreply.github.com>
|
||||||
|
Alex Petenchea <alex.petenchea@gmail.com>
|
||||||
|
Alex Renda <alexrenda@users.noreply.github.com>
|
||||||
|
Alex von Gluck IV <kallisti5@unixzen.com>
|
||||||
|
Alexey Parfenov <zxed@alkatrazstudio.net>
|
||||||
|
Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
|
||||||
|
Ali Nehzat <ali.nehzat@thanks.dev>
|
||||||
|
Ali Tariq <ali.tariq@10xengineers.ai>
|
||||||
|
Alon <alonfaraj@gmail.com>
|
||||||
|
AlpinDale <52078762+AlpinDale@users.noreply.github.com>
|
||||||
|
AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
|
||||||
|
Ananta Bastola <anantarajbastola@gmail.com>
|
||||||
|
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
|
||||||
|
András Salamon <ott2@users.noreply.github.com>
|
||||||
|
Andrei <abetlen@gmail.com>
|
||||||
|
Andrew Canis <andrew.canis@gmail.com>
|
||||||
|
Andrew Duffy <a10y@users.noreply.github.com>
|
||||||
|
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
|
||||||
|
Arik Poznanski <arikpoz@users.noreply.github.com>
|
||||||
|
Artem <guinmoon@gmail.com>
|
||||||
|
Artyom Lebedev <vagran.ast@gmail.com>
|
||||||
|
Asbjørn Olling <asbjornolling@gmail.com>
|
||||||
|
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
|
||||||
|
Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
|
||||||
|
Ashraful Islam <ashraful.meche@gmail.com>
|
||||||
|
Atsushi Tatsuma <yoshoku@outlook.com>
|
||||||
|
Austin <77757836+teleprint-me@users.noreply.github.com>
|
||||||
|
AustinMroz <austinmroz@utexas.edu>
|
||||||
|
BADR <contact@pythops.com>
|
||||||
|
Bach Le <bach@bullno1.com>
|
||||||
|
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
|
||||||
|
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
|
||||||
|
Behnam M <58621210+ibehnam@users.noreply.github.com>
|
||||||
|
Ben Garney <bengarney@users.noreply.github.com>
|
||||||
|
Ben Siraphob <bensiraphob@gmail.com>
|
||||||
|
Ben Williams <ben@719ben.com>
|
||||||
|
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
|
||||||
|
Bernat Vadell <hounter.caza@gmail.com>
|
||||||
|
Bodo Graumann <mail@bodograumann.de>
|
||||||
|
Bono Lv <lvscar@users.noreply.github.com>
|
||||||
|
Borislav Stanimirov <b.stanimirov@abv.bg>
|
||||||
|
Branden Butler <bwtbutler@hotmail.com>
|
||||||
|
Brian <mofosyne@gmail.com>
|
||||||
|
Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||||
|
CJ Pais <cj@cjpais.com>
|
||||||
|
CRD716 <crd716@gmail.com>
|
||||||
|
Cameron <csteele@steelecameron.com>
|
||||||
|
Cameron Kaiser <classilla@users.noreply.github.com>
|
||||||
|
Casey Primozic <casey@cprimozic.net>
|
||||||
|
Casey Primozic <me@ameo.link>
|
||||||
|
CausalLM <148736309+CausalLM@users.noreply.github.com>
|
||||||
|
Cebtenzzre <cebtenzzre@gmail.com>
|
||||||
|
Chad Brewbaker <crb002@gmail.com>
|
||||||
|
Cheng Shao <terrorjack@type.dance>
|
||||||
|
Chris Kuehl <ckuehl@ckuehl.me>
|
||||||
|
Christian Demsar <christian@github.email.demsar.us>
|
||||||
|
Christian Demsar <crasm@git.vczf.us>
|
||||||
|
Christian Falch <875252+chrfalch@users.noreply.github.com>
|
||||||
|
Christian Kögler <ck3d@gmx.de>
|
||||||
|
Clark Saben <76020733+csaben@users.noreply.github.com>
|
||||||
|
Clint Herron <hanclinto@gmail.com>
|
||||||
|
Cuong Trinh Manh <nguoithichkhampha@gmail.com>
|
||||||
|
DAN™ <dranger003@gmail.com>
|
||||||
|
Damian Stewart <d@damianstewart.com>
|
||||||
|
Dane Madsen <dane_madsen@hotmail.com>
|
||||||
|
DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
|
||||||
|
Daniel Bevenius <daniel.bevenius@gmail.com>
|
||||||
|
Daniel Drake <drake@endlessos.org>
|
||||||
|
Daniel Hiltgen <dhiltgen@users.noreply.github.com>
|
||||||
|
Daniel Illescas Romero <illescas.daniel@protonmail.com>
|
||||||
|
DannyDaemonic <DannyDaemonic@gmail.com>
|
||||||
|
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
|
||||||
|
Dave Della Costa <ddellacosta+github@gmail.com>
|
||||||
|
David Friehs <david@friehs.info>
|
||||||
|
David Kennedy <dakennedyd@gmail.com>
|
||||||
|
David Pflug <david@pflug.email>
|
||||||
|
David Renshaw <dwrenshaw@gmail.com>
|
||||||
|
David Sommers <12738+databyte@users.noreply.github.com>
|
||||||
|
David Yang <davidyang6us@gmail.com>
|
||||||
|
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
|
||||||
|
Dean <Dean.Sinaean@gmail.com>
|
||||||
|
Deins <deinsegle@gmail.com>
|
||||||
|
Didzis Gosko <didzis@users.noreply.github.com>
|
||||||
|
Don Mahurin <dmahurin@users.noreply.github.com>
|
||||||
|
DooWoong Lee (David) <manics99@naver.com>
|
||||||
|
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
|
||||||
|
Douglas Hanley <thesecretaryofwar@gmail.com>
|
||||||
|
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
|
||||||
|
Ebey Abraham <ebey97@gmail.com>
|
||||||
|
Ed Lee <edilee@mozilla.com>
|
||||||
|
Ed Lepedus <ed.lepedus@googlemail.com>
|
||||||
|
Edward Taylor <edeetee@gmail.com>
|
||||||
|
Elbios <141279586+Elbios@users.noreply.github.com>
|
||||||
|
Engininja2 <139037756+Engininja2@users.noreply.github.com>
|
||||||
|
Equim <sayaka@ekyu.moe>
|
||||||
|
Eric Sommerlade <es0m@users.noreply.github.com>
|
||||||
|
Eric Zhang <34133756+EZForever@users.noreply.github.com>
|
||||||
|
Erik Garrison <erik.garrison@gmail.com>
|
||||||
|
Erik Scholz <Green-Sky@users.noreply.github.com>
|
||||||
|
Ettore Di Giacinto <mudler@users.noreply.github.com>
|
||||||
|
Evan Jones <evan.q.jones@gmail.com>
|
||||||
|
Evan Miller <emmiller@gmail.com>
|
||||||
|
Eve <139727413+netrunnereve@users.noreply.github.com>
|
||||||
|
Evgeny Kurnevsky <kurnevsky@gmail.com>
|
||||||
|
Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
|
||||||
|
ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
|
||||||
|
FK <sozforex@gmail.com>
|
||||||
|
Fabian <cmdrf@users.noreply.github.com>
|
||||||
|
Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
|
||||||
|
Faez Shakil <faez.shakil@gmail.com>
|
||||||
|
FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
|
||||||
|
Fattire <528174+fat-tire@users.noreply.github.com>
|
||||||
|
Felix <stenbackfelix@gmail.com>
|
||||||
|
Finn Voorhees <finnvoorhees@gmail.com>
|
||||||
|
Firat <firatkiral@gmail.com>
|
||||||
|
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
|
||||||
|
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
|
||||||
|
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
|
||||||
|
FrankHB <frankhb1989@gmail.com>
|
||||||
|
Frederik Vogel <Schaltfehler@users.noreply.github.com>
|
||||||
|
Gabe Goodhart <gabe.l.hart@gmail.com>
|
||||||
|
GainLee <perfecter.gen@gmail.com>
|
||||||
|
Galunid <karolek1231456@gmail.com>
|
||||||
|
Gary Linscott <glinscott@gmail.com>
|
||||||
|
Gary Mulder <gjmulder@gmail.com>
|
||||||
|
Genkagaku.GPT <hlhr202@163.com>
|
||||||
|
Georgi Gerganov <ggerganov@gmail.com>
|
||||||
|
Gilad S <giladgd@users.noreply.github.com>
|
||||||
|
GiviMAD <GiviMAD@users.noreply.github.com>
|
||||||
|
Govlzkoy <gotope@users.noreply.github.com>
|
||||||
|
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
|
||||||
|
Guillaume Wenzek <gwenzek@users.noreply.github.com>
|
||||||
|
Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
|
||||||
|
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
|
||||||
|
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
|
||||||
|
Haohui Mai <ricetons@gmail.com>
|
||||||
|
Haoxiang Fei <tonyfettes@tonyfettes.com>
|
||||||
|
Harald Fernengel <harald.fernengel@here.com>
|
||||||
|
Hatsune Miku <129688334+at8u@users.noreply.github.com>
|
||||||
|
Henk Poley <HenkPoley@gmail.com>
|
||||||
|
Henri Vasserman <henv@hot.ee>
|
||||||
|
Henrik Forstén <henrik.forsten@gmail.com>
|
||||||
|
Herman Semenov <GermanAizek@yandex.ru>
|
||||||
|
Hesen Peng <hesen.peng@gmail.com>
|
||||||
|
Hoang Nguyen <hugo53@users.noreply.github.com>
|
||||||
|
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
|
||||||
|
Howard Su <howard0su@gmail.com>
|
||||||
|
Hua Jiang <allenhjiang@outlook.com>
|
||||||
|
Huawei Lin <huaweilin.cs@gmail.com>
|
||||||
|
Ian Bull <irbull@eclipsesource.com>
|
||||||
|
Ian Bull <irbull@gmail.com>
|
||||||
|
Ian Scrivener <github@zilogy.asia>
|
||||||
|
Ido S <ido.pluto@gmail.com>
|
||||||
|
IgnacioFDM <ignaciofdm@gmail.com>
|
||||||
|
Igor Okulist <okigan@gmail.com>
|
||||||
|
Ikko Eltociear Ashimine <eltociear@gmail.com>
|
||||||
|
Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
|
||||||
|
Ionoclast Laboratories <brigham@ionoclast.com>
|
||||||
|
Isaac McFadyen <isaac@imcf.me>
|
||||||
|
IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
|
||||||
|
Ivan Komarov <Ivan.Komarov@dfyz.info>
|
||||||
|
Ivan Stepanov <ivanstepanovftw@gmail.com>
|
||||||
|
JH23X <165871467+JH23X@users.noreply.github.com>
|
||||||
|
Jack Mousseau <jmousseau@users.noreply.github.com>
|
||||||
|
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
|
||||||
|
Jag Chadha <jagtesh@gmail.com>
|
||||||
|
Jakub N <jakubniemczyk97@gmail.com>
|
||||||
|
James Reynolds <magnusviri@users.noreply.github.com>
|
||||||
|
Jan Boon <jan.boon@kaetemi.be>
|
||||||
|
Jan Boon <kaetemi@gmail.com>
|
||||||
|
Jan Ploski <jpl@plosquare.com>
|
||||||
|
Jannis Schönleber <joennlae@gmail.com>
|
||||||
|
Jared Van Bortel <cebtenzzre@gmail.com>
|
||||||
|
Jared Van Bortel <jared@nomic.ai>
|
||||||
|
Jason McCartney <jmac@theroot.org>
|
||||||
|
Jean-Christophe Hoelt <hoelt@fovea.cc>
|
||||||
|
Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
|
||||||
|
Jed Fox <git@jedfox.com>
|
||||||
|
Jeffrey Quesnelle <emozilla@nousresearch.com>
|
||||||
|
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
|
||||||
|
Jhen-Jie Hong <iainst0409@gmail.com>
|
||||||
|
Jiahao Li <liplus17@163.com>
|
||||||
|
Jian Liao <jianliao@users.noreply.github.com>
|
||||||
|
JidongZhang-THU <1119708529@qq.com>
|
||||||
|
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
|
||||||
|
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
|
||||||
|
Johannes Gäßler <johannesg@5d6.de>
|
||||||
|
Johannes Rudolph <johannes.rudolph@gmail.com>
|
||||||
|
John <78893154+cmp-nct@users.noreply.github.com>
|
||||||
|
John Balis <phobossystems@gmail.com>
|
||||||
|
John Smith <67539080+kingsidelee@users.noreply.github.com>
|
||||||
|
JohnnyB <jboero@users.noreply.github.com>
|
||||||
|
Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
|
||||||
|
Jorge A <161275481+jorgealias@users.noreply.github.com>
|
||||||
|
Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
|
||||||
|
Joseph Stahl <1269177+josephst@users.noreply.github.com>
|
||||||
|
Joyce <joycebrum@google.com>
|
||||||
|
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
|
||||||
|
Judd <foldl@users.noreply.github.com>
|
||||||
|
Julius Arkenberg <arki05@users.noreply.github.com>
|
||||||
|
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
|
||||||
|
Juraj Bednar <juraj@bednar.io>
|
||||||
|
Justin Parker <jparkerweb@gmail.com>
|
||||||
|
Justin Suess <justin.suess@westpoint.edu>
|
||||||
|
Justine Tunney <jtunney@gmail.com>
|
||||||
|
Juuso Alasuutari <juuso.alasuutari@gmail.com>
|
||||||
|
KASR <karim.asrih@gmail.com>
|
||||||
|
Kamil Tomšík <info@tomsik.cz>
|
||||||
|
Karsten Weiss <knweiss@gmail.com>
|
||||||
|
Karthick <j.karthic2004@gmail.com>
|
||||||
|
Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
|
||||||
|
Karthik Sethuraman <k.seth1993@gmail.com>
|
||||||
|
Kasumi <90275229+kasumi-1@users.noreply.github.com>
|
||||||
|
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
|
||||||
|
Keiichi Tabata <keiichi.tabata@outlook.com>
|
||||||
|
Kenvix ⭐ <kenvixzure@live.com>
|
||||||
|
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
|
||||||
|
Kevin Ji <1146876+kevinji@users.noreply.github.com>
|
||||||
|
Kevin Kwok <antimatter15@gmail.com>
|
||||||
|
Kevin Lo <kevlo@kevlo.org>
|
||||||
|
Kolen Cheung <ickc@users.noreply.github.com>
|
||||||
|
Konstantin Herud <konstantin.herud@denkbares.com>
|
||||||
|
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
|
||||||
|
Kunshang Ji <kunshang.ji@intel.com>
|
||||||
|
Kyle Liang <liangmanlai@gmail.com>
|
||||||
|
Kyle Mistele <kyle@mistele.com>
|
||||||
|
Kylin <56434533+KyL0N@users.noreply.github.com>
|
||||||
|
Lars Grammel <lars.grammel@gmail.com>
|
||||||
|
Laura <Tijntje_7@msn.com>
|
||||||
|
Lee <44310445+lx200916@users.noreply.github.com>
|
||||||
|
Lee Drake <b.lee.drake@gmail.com>
|
||||||
|
Leng Yue <lengyue@lengyue.me>
|
||||||
|
LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
|
||||||
|
Leonardo Neumann <leonardo@neumann.dev.br>
|
||||||
|
Li Tan <tanliboy@gmail.com>
|
||||||
|
Linwei Wang <wanix1988@gmail.com>
|
||||||
|
LoganDark <github@logandark.mozmail.com>
|
||||||
|
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
||||||
|
Luciano <lucianostrika44@gmail.com>
|
||||||
|
Luo Tian <lt@basecity.com>
|
||||||
|
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
||||||
|
Maarten ter Huurne <maarten@treewalker.org>
|
||||||
|
Mack Straight <eiz@users.noreply.github.com>
|
||||||
|
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
||||||
|
MaggotHATE <clay1326@gmail.com>
|
||||||
|
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
||||||
|
Marco Matthies <71844+marcom@users.noreply.github.com>
|
||||||
|
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
||||||
|
Marian Cepok <marian.cepok@gmail.com>
|
||||||
|
Mark Fairbairn <thebaron88@gmail.com>
|
||||||
|
Marko Tasic <mtasic85@gmail.com>
|
||||||
|
Martin Krasser <krasserm@googlemail.com>
|
||||||
|
Martin Schwaighofer <mschwaig@users.noreply.github.com>
|
||||||
|
Marvin Gießing <marvin.giessing@gmail.com>
|
||||||
|
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
||||||
|
Matheus C. França <matheus-catarino@hotmail.com>
|
||||||
|
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
||||||
|
Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
|
||||||
|
Mathijs de Bruin <mathijs@mathijsfietst.nl>
|
||||||
|
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
||||||
|
Matt Pulver <matt.pulver@heavy.ai>
|
||||||
|
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
||||||
|
Matthew Tejo <matthew.tejo@gmail.com>
|
||||||
|
Matvey Soloviev <blackhole89@gmail.com>
|
||||||
|
Maxime <672982+maximegmd@users.noreply.github.com>
|
||||||
|
Maximilian Winter <maximilian.winter.91@gmail.com>
|
||||||
|
Meng Zhang <meng@tabbyml.com>
|
||||||
|
Meng, Hengyu <hengyu.meng@intel.com>
|
||||||
|
Merrick Christensen <merrick.christensen@gmail.com>
|
||||||
|
Michael Coppola <m18coppola@gmail.com>
|
||||||
|
Michael Hueschen <m@mhueschen.dev>
|
||||||
|
Michael Kesper <mkesper@schokokeks.org>
|
||||||
|
Michael Klimenko <mklimenko29@gmail.com>
|
||||||
|
Michael Podvitskiy <podvitskiymichael@gmail.com>
|
||||||
|
Michael Potter <NanoTekGuy@Gmail.com>
|
||||||
|
Michaël de Vries <vriesdemichael@gmail.com>
|
||||||
|
Mihai <mihai.chirculescu@yahoo.com>
|
||||||
|
Mike <ytianhui2004@gmail.com>
|
||||||
|
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
||||||
|
Mirko185 <mirkosig@gmail.com>
|
||||||
|
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
||||||
|
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
||||||
|
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
||||||
|
Murilo Santana <mvrilo@gmail.com>
|
||||||
|
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
||||||
|
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
||||||
|
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
||||||
|
Nebula <infinitewormhole@gmail.com>
|
||||||
|
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
||||||
|
Neuman Vong <neuman.vong@gmail.com>
|
||||||
|
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
||||||
|
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
||||||
|
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
||||||
|
Nigel Bosch <pnigelb@gmail.com>
|
||||||
|
Niklas Korz <niklas@niklaskorz.de>
|
||||||
|
Nindaleth <Nindaleth@users.noreply.github.com>
|
||||||
|
Oleksandr Nikitin <oleksandr@tvori.info>
|
||||||
|
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
||||||
|
Olivier Chafik <ochafik@users.noreply.github.com>
|
||||||
|
Ondřej Čertík <ondrej@certik.us>
|
||||||
|
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
||||||
|
Paul Tsochantaris <ptsochantaris@icloud.com>
|
||||||
|
Pavol Rusnak <pavol@rusnak.io>
|
||||||
|
Pedro Cuenca <pedro@huggingface.co>
|
||||||
|
Peter Sugihara <peter@campsh.com>
|
||||||
|
Phil H <5756783+phiharri@users.noreply.github.com>
|
||||||
|
Philip Taron <philip.taron@gmail.com>
|
||||||
|
Phillip Kravtsov <phillip@kravtsov.net>
|
||||||
|
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
|
||||||
|
Pierrick Hymbert <pierrick.hymbert@gmail.com>
|
||||||
|
Przemysław Pawełczyk <przemoc@gmail.com>
|
||||||
|
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
|
||||||
|
Qingyou Meng <meng.qingyou@gmail.com>
|
||||||
|
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
|
||||||
|
RJ Adriaansen <adriaansen@eshcc.eur.nl>
|
||||||
|
Radoslav Gerganov <rgerganov@gmail.com>
|
||||||
|
Radosław Gryta <radek.gryta@gmail.com>
|
||||||
|
Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
|
||||||
|
Rand Xie <randxiexyy29@gmail.com>
|
||||||
|
Randall Fitzgerald <randall@dasaku.net>
|
||||||
|
Reinforce-II <fate@eastal.com>
|
||||||
|
Riceball LEE <snowyu.lee@gmail.com>
|
||||||
|
Richard Kiss <him@richardkiss.com>
|
||||||
|
Richard Roberson <richardr1126@gmail.com>
|
||||||
|
Rick G <26732651+TheFlipbook@users.noreply.github.com>
|
||||||
|
Rickard Edén <rickardeden@gmail.com>
|
||||||
|
Rickard Hallerbäck <rickard.hallerback@gmail.com>
|
||||||
|
Rickey Bowers Jr <bitRAKE@gmail.com>
|
||||||
|
Riley Stewart <ristew@users.noreply.github.com>
|
||||||
|
Rinne <AsakusaRinne@gmail.com>
|
||||||
|
Rinne <liu_yaohui1998@126.com>
|
||||||
|
Robert Brisita <986796+rbrisita@users.noreply.github.com>
|
||||||
|
Robert Sung-wook Shin <edp1096@users.noreply.github.com>
|
||||||
|
Robey Holderith <robey@flaminglunchbox.net>
|
||||||
|
Robyn <robyngraf@users.noreply.github.com>
|
||||||
|
Roger Meier <r.meier@siemens.com>
|
||||||
|
Roland <14355895+rbur0425@users.noreply.github.com>
|
||||||
|
Romain D <90720+Artefact2@users.noreply.github.com>
|
||||||
|
Romain Neutron <romain@neutron.io>
|
||||||
|
Roman Parykin <donderom@gmail.com>
|
||||||
|
Ron Evans <ron@hybridgroup.com>
|
||||||
|
Ron Jailall <rojailal@gmail.com>
|
||||||
|
Ronny Brendel <ronnybrendel@gmail.com>
|
||||||
|
Ronsor <ronsor@ronsor.pw>
|
||||||
|
Rowan Hart <rowanbhart@gmail.com>
|
||||||
|
Rune <43761327+Rune-AI@users.noreply.github.com>
|
||||||
|
Ryan Landay <rlanday@gmail.com>
|
||||||
|
Ryder Wishart <ryderwishart@gmail.com>
|
||||||
|
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
||||||
|
SakuraUmi <yukinon244@gmail.com>
|
||||||
|
Salvador E. Tropea <stropea@inti.gob.ar>
|
||||||
|
Sam Spilsbury <smspillaz@gmail.com>
|
||||||
|
Sami Farin <3876865+Safari77@users.noreply.github.com>
|
||||||
|
Samuel Maynard <samwmaynard@gmail.com>
|
||||||
|
Sang-Kil Park <sang.park@42dot.ai>
|
||||||
|
Seb C <47074056+Sebby37@users.noreply.github.com>
|
||||||
|
Sebastián A <sebastian.aedo29@gmail.com>
|
||||||
|
SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
||||||
|
Senemu <10880819+Senemu@users.noreply.github.com>
|
||||||
|
Sergey Alirzaev <zl29ah@gmail.com>
|
||||||
|
Sergio López <slp@sinrega.org>
|
||||||
|
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
||||||
|
ShadovvBeast <ShadovvBeast@gmail.com>
|
||||||
|
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
||||||
|
Shangning Xu <32517059+xushangning@users.noreply.github.com>
|
||||||
|
Shijie <821898965@qq.com>
|
||||||
|
Shintarou Okada <kokuzen@gmail.com>
|
||||||
|
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
||||||
|
Shouzheng Liu <lshzh.hi@gmail.com>
|
||||||
|
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
||||||
|
Simon Willison <swillison@gmail.com>
|
||||||
|
Siwen Yu <yusiwen@gmail.com>
|
||||||
|
Sky Yan <skyan83@gmail.com>
|
||||||
|
Slaren <2141330+slaren@users.noreply.github.com>
|
||||||
|
Slava Primenko <primenko.s@gmail.com>
|
||||||
|
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
|
||||||
|
Someone <sergei.kozlukov@aalto.fi>
|
||||||
|
Someone Serge <sergei.kozlukov@aalto.fi>
|
||||||
|
Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
|
||||||
|
Spencer Sutton <spencersutton@users.noreply.github.com>
|
||||||
|
Srinivas Billa <nivibilla@gmail.com>
|
||||||
|
Stefan Sydow <stefan@sydow.email>
|
||||||
|
Stephan Walter <stephan@walter.name>
|
||||||
|
Stephen Nichols <snichols@users.noreply.github.com>
|
||||||
|
Steve Grubb <ausearch.1@gmail.com>
|
||||||
|
Steven Roussey <sroussey@gmail.com>
|
||||||
|
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
||||||
|
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
||||||
|
SuperUserNameMan <yoann@terminajones.com>
|
||||||
|
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
|
||||||
|
Taikono-Himazin <kazu@po.harenet.ne.jp>
|
||||||
|
Tameem <113388789+AhmadTameem@users.noreply.github.com>
|
||||||
|
Tamotsu Takahashi <ttakah+github@gmail.com>
|
||||||
|
Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
|
||||||
|
Thatcher Chamberlin <j.thatcher.c@gmail.com>
|
||||||
|
Theia Vogel <theia@vgel.me>
|
||||||
|
Thérence <13496987+Royalphax@users.noreply.github.com>
|
||||||
|
Thibault Terrasson <thibault.terrasson@gmail.com>
|
||||||
|
Thomas Klausner <wiz@gatalith.at>
|
||||||
|
Tim Miller <drasticactions@users.noreply.github.com>
|
||||||
|
Timmy Knight <r2d2fish@gmail.com>
|
||||||
|
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
|
||||||
|
Ting Lou <ting.lou@gmail.com>
|
||||||
|
Ting Sun <suntcrick@gmail.com>
|
||||||
|
Tobias Lütke <tobi@shopify.com>
|
||||||
|
Tom C <tom.corelis@gmail.com>
|
||||||
|
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
||||||
|
Tomas <tom.tomas.36478119@gmail.com>
|
||||||
|
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
||||||
|
Tristan Ross <rosscomputerguy@protonmail.com>
|
||||||
|
Tungsten842 <886724vf@anonaddy.me>
|
||||||
|
Tungsten842 <quantmint@protonmail.com>
|
||||||
|
Tushar <ditsuke@protonmail.com>
|
||||||
|
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
||||||
|
Uzo Nweke <uzoechi@gmail.com>
|
||||||
|
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
||||||
|
Val Kharitonov <mail@kharvd.com>
|
||||||
|
Valentin Konovalov <valle.ketsujin@gmail.com>
|
||||||
|
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
||||||
|
Victor Z. Peng <ziliangdotme@gmail.com>
|
||||||
|
Vlad <spitfireage@gmail.com>
|
||||||
|
Vladimir <bogdad@gmail.com>
|
||||||
|
Vladimir Malyutin <first-leon@yandex.ru>
|
||||||
|
Vladimir Zorin <vladimir@deviant.guru>
|
||||||
|
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
|
||||||
|
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
|
||||||
|
Weird Constructor <weirdconstructor@gmail.com>
|
||||||
|
Welby Seely <welbyseely@gmail.com>
|
||||||
|
Wentai Zhang <rchardx@gmail.com>
|
||||||
|
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
||||||
|
Willy Tarreau <w@1wt.eu>
|
||||||
|
Wu Jian Ping <wujjpp@hotmail.com>
|
||||||
|
Wu Jian Ping <wujp@greatld.com>
|
||||||
|
Xiake Sun <xiake.sun@intel.com>
|
||||||
|
Xiang (Kevin) Li <kevinli020508@gmail.com>
|
||||||
|
Xiao-Yong Jin <jinxiaoyong@gmail.com>
|
||||||
|
XiaotaoChen <chenxiaotao1234@gmail.com>
|
||||||
|
Xiaoyi Chen <cxychina@gmail.com>
|
||||||
|
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
||||||
|
Xuan Son Nguyen <thichthat@gmail.com>
|
||||||
|
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
||||||
|
Yiming Cui <conandiy@vip.qq.com>
|
||||||
|
Yishuo Wang <MeouSker77@outlook.com>
|
||||||
|
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
||||||
|
Yui <dev@sleepyyui.com>
|
||||||
|
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
|
||||||
|
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
|
||||||
|
ZHAOKAI WANG <sanxianwei@163.com>
|
||||||
|
Zane Shannon <z@zcs.me>
|
||||||
|
Zay <95888118+isaiahbjork@users.noreply.github.com>
|
||||||
|
Zenix <zenixls2@gmail.com>
|
||||||
|
Zhang Peiyuan <a1286225768@gmail.com>
|
||||||
|
ZhouYuChen <zhouyuchen@naver.com>
|
||||||
|
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
||||||
|
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
||||||
|
Zsapi <martin1.zsapka@gmail.com>
|
||||||
|
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
|
||||||
|
adel boussaken <netdur@gmail.com>
|
||||||
|
afrideva <95653597+afrideva@users.noreply.github.com>
|
||||||
|
akawrykow <142945436+akawrykow@users.noreply.github.com>
|
||||||
|
alexpinel <93524949+alexpinel@users.noreply.github.com>
|
||||||
|
alonfaraj <alonfaraj@gmail.com>
|
||||||
|
andrijdavid <david@geek.mg>
|
||||||
|
anon998 <131767832+anon998@users.noreply.github.com>
|
||||||
|
anzz1 <anzz1@live.com>
|
||||||
|
apaz <aarpazdera@gmail.com>
|
||||||
|
apcameron <37645737+apcameron@users.noreply.github.com>
|
||||||
|
arcrank <arcrank@gmail.com>
|
||||||
|
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
||||||
|
at8u <129688334+at8u@users.noreply.github.com>
|
||||||
|
automaticcat <daogiatuank54@gmail.com>
|
||||||
|
bandoti <141645996+bandoti@users.noreply.github.com>
|
||||||
|
beiller <beiller@gmail.com>
|
||||||
|
bhubbb <79117352+bhubbb@users.noreply.github.com>
|
||||||
|
bmwl <brian.marshall@tolko.com>
|
||||||
|
bobqianic <129547291+bobqianic@users.noreply.github.com>
|
||||||
|
bryanSwk <93190252+bryanSwk@users.noreply.github.com>
|
||||||
|
bsilvereagle <bsilvereagle@users.noreply.github.com>
|
||||||
|
bssrdf <merlintiger@hotmail.com>
|
||||||
|
byte-6174 <88070277+byte-6174@users.noreply.github.com>
|
||||||
|
cebtenzzre <cebtenzzre@gmail.com>
|
||||||
|
chaihahaha <chai836275709@gmail.com>
|
||||||
|
chiranko <96988916+chiranko@users.noreply.github.com>
|
||||||
|
clibdev <52199778+clibdev@users.noreply.github.com>
|
||||||
|
clyang <clyang@clyang.net>
|
||||||
|
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
|
||||||
|
coezbek <c.oezbek@gmail.com>
|
||||||
|
comex <comexk@gmail.com>
|
||||||
|
compilade <113953597+compilade@users.noreply.github.com>
|
||||||
|
crasm <crasm@git.vczf.net>
|
||||||
|
crasm <crasm@git.vczf.us>
|
||||||
|
daboe01 <daboe01@googlemail.com>
|
||||||
|
david raistrick <keen99@users.noreply.github.com>
|
||||||
|
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
||||||
|
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
||||||
|
divinity76 <divinity76@gmail.com>
|
||||||
|
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
||||||
|
drbh <david.richard.holtz@gmail.com>
|
||||||
|
ds5t5 <145942675+ds5t5@users.noreply.github.com>
|
||||||
|
dylan <canardleteer@users.noreply.github.com>
|
||||||
|
eastriver <lee@eastriver.dev>
|
||||||
|
ebraminio <ebraminio@gmail.com>
|
||||||
|
eiery <19350831+eiery@users.noreply.github.com>
|
||||||
|
eric8607242 <e0928021388@gmail.com>
|
||||||
|
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||||
|
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
||||||
|
gliptic <gliptic@users.noreply.github.com>
|
||||||
|
goerch <jhr.walter@t-online.de>
|
||||||
|
grahameth <96447521+grahameth@users.noreply.github.com>
|
||||||
|
gwjr <502526+gwjr@users.noreply.github.com>
|
||||||
|
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
||||||
|
hankcs <cnhankmc@gmail.com>
|
||||||
|
hoangmit <hoangmit@users.noreply.github.com>
|
||||||
|
hongbo.mo <352280764@qq.com>
|
||||||
|
howlger <eclipse@voormann.de>
|
||||||
|
howlger <github@voormann.de>
|
||||||
|
hutli <6594598+hutli@users.noreply.github.com>
|
||||||
|
hutli <hutli@hutli.hu>
|
||||||
|
hutli <jensstaermose@hotmail.com>
|
||||||
|
hxer7963 <hxer7963@gmail.com>
|
||||||
|
hydai <z54981220@gmail.com>
|
||||||
|
iSma <ismail.senhaji@gmail.com>
|
||||||
|
iacore <74560659+iacore@users.noreply.github.com>
|
||||||
|
igarnier <igarnier@protonmail.com>
|
||||||
|
iohub <rickyang.pro@gmail.com>
|
||||||
|
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||||
|
jameswu2014 <545426914@qq.com>
|
||||||
|
jneem <joeneeman@gmail.com>
|
||||||
|
johnson442 <56517414+johnson442@users.noreply.github.com>
|
||||||
|
jon-chuang <9093549+jon-chuang@users.noreply.github.com>
|
||||||
|
jp-x-g <jpxg-dev@protonmail.com>
|
||||||
|
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
||||||
|
kaizau <kaizau@users.noreply.github.com>
|
||||||
|
kalomaze <66376113+kalomaze@users.noreply.github.com>
|
||||||
|
kang <tpdns9032100@gmail.com>
|
||||||
|
katsu560 <118887472+katsu560@users.noreply.github.com>
|
||||||
|
kchro3 <62481661+kchro3@users.noreply.github.com>
|
||||||
|
khimaros <me@khimaros.com>
|
||||||
|
kiltyj <kiltyj@gmail.com>
|
||||||
|
klosax <131523366+klosax@users.noreply.github.com>
|
||||||
|
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
||||||
|
kunnis <kunnis@users.noreply.github.com>
|
||||||
|
kuronekosaiko <EvanChanJ@163.com>
|
||||||
|
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
||||||
|
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
||||||
|
l3utterfly <gc.pthzfoldr@gmail.com>
|
||||||
|
ldwang <ftgreat@163.com>
|
||||||
|
le.chang <cljs118@126.com>
|
||||||
|
leejet <leejet714@gmail.com>
|
||||||
|
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
||||||
|
lon <114724657+longregen@users.noreply.github.com>
|
||||||
|
m3ndax <adrian.goessl@outlook.com>
|
||||||
|
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||||
|
makomk <makosoft@googlemail.com>
|
||||||
|
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||||
|
mdrokz <mohammadmunshi@gmail.com>
|
||||||
|
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||||
|
minarchist <minarchist@users.noreply.github.com>
|
||||||
|
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
||||||
|
mmyjona <jonathan.gonse@gmail.com>
|
||||||
|
momonga <115213907+mmnga@users.noreply.github.com>
|
||||||
|
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
||||||
|
mzcu <milos.cubrilo@gmail.com>
|
||||||
|
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
||||||
|
ngc92 <7938269+ngc92@users.noreply.github.com>
|
||||||
|
nhamanasu <45545786+nhamanasu@users.noreply.github.com>
|
||||||
|
niansa/tuxifan <anton-sa@web.de>
|
||||||
|
niansa/tuxifan <tuxifan@posteo.de>
|
||||||
|
ningshanwutuobang <ningshanwutuobang@gmail.com>
|
||||||
|
nold <Nold360@users.noreply.github.com>
|
||||||
|
nopperl <54780682+nopperl@users.noreply.github.com>
|
||||||
|
nusu-github <29514220+nusu-github@users.noreply.github.com>
|
||||||
|
olexiyb <olexiyb@gmail.com>
|
||||||
|
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||||
|
opparco <parco.opaai@gmail.com>
|
||||||
|
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||||
|
perserk <perserk@gmail.com>
|
||||||
|
postmasters <namnguyen@google.com>
|
||||||
|
pudepiedj <pudepiedj@gmail.com>
|
||||||
|
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
|
||||||
|
qouoq <qouoq@fastmail.com>
|
||||||
|
qunash <anzoria@gmail.com>
|
||||||
|
rabidcopy <rabidcopy@yahoo.com>
|
||||||
|
rankaiyx <rankaiyx@rankaiyx.com>
|
||||||
|
rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
|
||||||
|
rhuddleston <ryan.huddleston@percona.com>
|
||||||
|
rimoliga <53384203+rimoliga@users.noreply.github.com>
|
||||||
|
runfuture <runfuture@users.noreply.github.com>
|
||||||
|
sandyiscool <sandyiscool@gmail.com>
|
||||||
|
semidark <me@semidark.net>
|
||||||
|
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||||
|
shibe2 <shibe@tuta.io>
|
||||||
|
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||||
|
sjinzh <sjinzh@gmail.com>
|
||||||
|
slaren <2141330+slaren@users.noreply.github.com>
|
||||||
|
slaren <slarengh@gmail.com>
|
||||||
|
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||||
|
staviq <staviq@gmail.com>
|
||||||
|
stduhpf <stephduh@live.fr>
|
||||||
|
swittk <switt1995@gmail.com>
|
||||||
|
takov751 <40316768+takov751@users.noreply.github.com>
|
||||||
|
tarcey <cey.tarik@gmail.com>
|
||||||
|
texmex76 <40733439+texmex76@users.noreply.github.com>
|
||||||
|
thement <40525767+thement@users.noreply.github.com>
|
||||||
|
tjohnman <tjohnman@users.noreply.github.com>
|
||||||
|
tslmy <tslmy@users.noreply.github.com>
|
||||||
|
ubik2 <ubik2@users.noreply.github.com>
|
||||||
|
uint256_t <konndennsa@gmail.com>
|
||||||
|
uint256_t <maekawatoshiki1017@gmail.com>
|
||||||
|
unbounded <haakon@likedan.net>
|
||||||
|
valiray <133289098+valiray@users.noreply.github.com>
|
||||||
|
vodkaslime <646329483@qq.com>
|
||||||
|
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||||
|
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||||
|
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
||||||
|
whoreson <139810751+whoreson@users.noreply.github.com>
|
||||||
|
wonjun Jang <strutive07@gmail.com>
|
||||||
|
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||||
|
xaedes <xaedes@gmail.com>
|
||||||
|
xaedes <xaedes@googlemail.com>
|
||||||
|
xloem <0xloem@gmail.com>
|
||||||
|
yangli2 <yangli2@gmail.com>
|
||||||
|
yuiseki <yuiseki@gmail.com>
|
||||||
|
zakkor <edward.partenie@gmail.com>
|
||||||
|
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||||
|
zrm <trustiosity.zrm@gmail.com>
|
||||||
|
源文雨 <41315874+fumiama@users.noreply.github.com>
|
||||||
|
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
2
LICENSE
2
LICENSE
|
@ -1,6 +1,6 @@
|
||||||
MIT License
|
MIT License
|
||||||
|
|
||||||
Copyright (c) 2023 Georgi Gerganov
|
Copyright (c) 2023-2024 The ggml authors
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|
8
Makefile
8
Makefile
|
@ -1,7 +1,7 @@
|
||||||
# Define the default target now so that it is always the first target
|
# Define the default target now so that it is always the first target
|
||||||
BUILD_TARGETS = \
|
BUILD_TARGETS = \
|
||||||
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||||
simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
|
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
|
||||||
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
||||||
|
|
||||||
# Binaries only useful for tests
|
# Binaries only useful for tests
|
||||||
|
@ -646,7 +646,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
|
||||||
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
||||||
ifndef CUDA_DOCKER_ARCH
|
ifndef CUDA_DOCKER_ARCH
|
||||||
ifndef CUDA_POWER_ARCH
|
ifndef CUDA_POWER_ARCH
|
||||||
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
|
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
|
||||||
endif # CUDA_POWER_ARCH
|
endif # CUDA_POWER_ARCH
|
||||||
endif # CUDA_DOCKER_ARCH
|
endif # CUDA_DOCKER_ARCH
|
||||||
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
||||||
|
@ -800,6 +800,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
|
@ -122,6 +122,8 @@ Typically finetunes of the base models below are supported as well.
|
||||||
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
||||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||||
|
|
||||||
|
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
|
||||||
|
|
||||||
**Multimodal models:**
|
**Multimodal models:**
|
||||||
|
|
||||||
- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
|
- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
|
||||||
|
@ -185,7 +187,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||||
- [Dot](https://github.com/alexpinel/Dot) (GPL)
|
- [Dot](https://github.com/alexpinel/Dot) (GPL)
|
||||||
- [MindMac](https://mindmac.app) (proprietary)
|
- [MindMac](https://mindmac.app) (proprietary)
|
||||||
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
|
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
|
||||||
|
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
|
||||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||||
|
|
||||||
---
|
---
|
||||||
|
@ -496,7 +498,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
|
|
||||||
This provides BLAS acceleration on HIP-supported AMD GPUs.
|
This provides BLAS acceleration on HIP-supported AMD GPUs.
|
||||||
Make sure to have ROCm installed.
|
Make sure to have ROCm installed.
|
||||||
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
|
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
|
||||||
|
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
```bash
|
```bash
|
||||||
|
@ -513,7 +515,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
|
|
||||||
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
||||||
```bash
|
```bash
|
||||||
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
|
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
||||||
```
|
```
|
||||||
|
|
||||||
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
||||||
|
|
|
@ -1745,6 +1745,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||||
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
||||||
cparams.pooling_type = params.pooling_type;
|
cparams.pooling_type = params.pooling_type;
|
||||||
cparams.defrag_thold = params.defrag_thold;
|
cparams.defrag_thold = params.defrag_thold;
|
||||||
|
cparams.cb_eval = params.cb_eval;
|
||||||
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||||
cparams.offload_kqv = !params.no_kv_offload;
|
cparams.offload_kqv = !params.no_kv_offload;
|
||||||
|
|
||||||
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
||||||
|
@ -2192,7 +2194,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||||
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
if (params.warmup) {
|
||||||
LOG("warming up the model with an empty run\n");
|
LOG("warming up the model with an empty run\n");
|
||||||
|
|
||||||
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
|
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
|
||||||
|
@ -2212,23 +2214,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_context * ctx,
|
const struct llama_context * ctx,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos,
|
bool add_special,
|
||||||
bool special) {
|
bool parse_special) {
|
||||||
return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
|
return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_model * model,
|
const struct llama_model * model,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos,
|
bool add_special,
|
||||||
bool special) {
|
bool parse_special) {
|
||||||
// upper limit for the number of tokens
|
// upper limit for the number of tokens
|
||||||
int n_tokens = text.length() + add_bos;
|
int n_tokens = text.length() + 2 * add_special;
|
||||||
std::vector<llama_token> result(n_tokens);
|
std::vector<llama_token> result(n_tokens);
|
||||||
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
|
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||||
if (n_tokens < 0) {
|
if (n_tokens < 0) {
|
||||||
result.resize(-n_tokens);
|
result.resize(-n_tokens);
|
||||||
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
|
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||||
GGML_ASSERT(check == -n_tokens);
|
GGML_ASSERT(check == -n_tokens);
|
||||||
} else {
|
} else {
|
||||||
result.resize(n_tokens);
|
result.resize(n_tokens);
|
||||||
|
|
|
@ -80,6 +80,9 @@ struct gpt_params {
|
||||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||||
|
|
||||||
|
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||||
|
void * cb_eval_user_data = nullptr;
|
||||||
|
|
||||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||||
|
|
||||||
llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||||
|
@ -156,6 +159,7 @@ struct gpt_params {
|
||||||
bool infill = false; // use infill mode
|
bool infill = false; // use infill mode
|
||||||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||||
bool no_kv_offload = false; // disable KV offloading
|
bool no_kv_offload = false; // disable KV offloading
|
||||||
|
bool warmup = true; // warmup run
|
||||||
|
|
||||||
std::string cache_type_k = "f16"; // KV cache data type for the K
|
std::string cache_type_k = "f16"; // KV cache data type for the K
|
||||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||||
|
@ -223,14 +227,14 @@ void llama_batch_add(
|
||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_context * ctx,
|
const struct llama_context * ctx,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos,
|
bool add_special,
|
||||||
bool special = false);
|
bool parse_special = false);
|
||||||
|
|
||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_model * model,
|
const struct llama_model * model,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos,
|
bool add_special,
|
||||||
bool special = false);
|
bool parse_special = false);
|
||||||
|
|
||||||
// tokenizes a token into a piece
|
// tokenizes a token into a piece
|
||||||
// should work similar to Python's `tokenizer.id_to_piece`
|
// should work similar to Python's `tokenizer.id_to_piece`
|
||||||
|
|
|
@ -160,7 +160,7 @@ class Model(ABC):
|
||||||
data = data.astype(np.float32)
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
         if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
             data = data.astype(np.float32)

         # if f16 desired, convert any float32 2-dim weight tensors to float16
@@ -227,15 +227,14 @@ class Model(ABC):
             return ("pytorch_model.bin",)
         return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

-    def _set_vocab_gpt2(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
+    # used for GPT-2 BPE and WordPiece vocabs
+    def get_basic_vocab(self) -> tuple[list[str], list[int]]:
         tokens: list[str] = []
         toktypes: list[int] = []

         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size

         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
@@ -255,11 +254,15 @@ class Model(ABC):
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)

+        return tokens, toktypes
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes = self.get_basic_vocab()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)

-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)

     def _set_vocab_qwen(self):
@@ -2043,34 +2046,25 @@ class BertModel(Model):
         self.gguf_writer.add_pooling_type(pooling_type)

     def set_vocab(self):
-        # use huggingface vocab to get all tokens
-        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
-        tokens, scores, toktypes = zip(*vocab.all_tokens())
-        assert len(tokens) == vocab.vocab_size
-        self.vocab_size = vocab.vocab_size
+        tokens, toktypes = self.get_basic_vocab()
+        self.vocab_size = len(tokens)

         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        n_token_types = len(set(toktypes))
-        self.gguf_writer.add_token_type_count(n_token_types)
+        self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"

         # convert to phantom space vocab
-        def phantom(tok, typ):
-            if tok.startswith(b"[") and tok.endswith(b"]"):
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
                 return tok
-            if tok.startswith(b"##"):
+            if tok.startswith("##"):
                 return tok[2:]
-            return b"\xe2\x96\x81" + tok
-        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
-        # set up bos and eos tokens (cls and sep)
-        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
-        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+            return "\u2581" + tok
+        tokens = list(map(phantom, tokens))

         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
         self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)

         # handle special tokens
@@ -2142,16 +2136,6 @@ class NomicBertModel(BertModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])

-    def get_tensors(self):
-        assert self.vocab_size is not None
-        for name, data in super().get_tensors():
-            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
-            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
-                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
-                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
-                data = data[:self.vocab_size, :]
-            yield name, data
-

 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
@@ -2327,7 +2311,8 @@ class MambaModel(Model):
                 data = data.astype(np.float32)

             # if f16 desired, convert big float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+            new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
+            if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
                 data = data.astype(np.float16)

             print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+from __future__ import annotations
+
 import argparse
 import os
 import sys

convert.py | 26
@@ -33,7 +33,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
 import gguf

 if TYPE_CHECKING:
-    from typing import TypeAlias
+    from typing_extensions import Self, TypeAlias

 if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
     faulthandler.register(signal.SIGUSR1)
@@ -139,7 +139,8 @@ class GGMLFileType(enum.IntEnum):
         dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
         if dt is None:
             raise ValueError(self)
-        # 1D tensors are always F32.
+        # Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
+        # Also The 1d tensors aren't much of a performance/size issue. So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
         return dt if len(tensor.shape) > 1 else DT_F32

@@ -516,7 +517,7 @@ class LlamaHfVocab(Vocab):
     tokenizer_model = "llama"
     name = "hfft"

-    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
+    def __init__(self, base_path: Path):
         fname_tokenizer = base_path / FAST_TOKENIZER_FILE
         # if this fails, FileNotFoundError propagates to caller
         with open(fname_tokenizer, encoding='utf-8') as f:
@@ -524,9 +525,7 @@ class LlamaHfVocab(Vocab):

         # pre-check so we know if we need transformers
         tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if ignore_nonllama:
-            pass # workaround incorrect use of this class for WordPiece
-        elif (
+        if (
             tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
             or tokenizer_json['decoder']['type'] != 'Sequence'
         ):
@@ -646,16 +645,17 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:


 class Tensor(ABC):
+    ndarray: NDArray
     data_type: DataType

     @abstractmethod
-    def astype(self, data_type: DataType) -> Tensor: ...
+    def astype(self, data_type: DataType) -> Self: ...
     @abstractmethod
-    def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
+    def permute(self, n_head: int, n_head_kv: int) -> Self: ...
     @abstractmethod
-    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
     @abstractmethod
-    def part(self, n_part: int) -> UnquantizedTensor: ...
+    def part(self, n_part: int) -> Self: ...
     @abstractmethod
     def to_ggml(self) -> GGMLCompatibleTensor: ...

@@ -672,13 +672,13 @@ class UnquantizedTensor(Tensor):
         self.ndarray = ndarray
         self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]

-    def astype(self, data_type: DataType) -> Tensor:
+    def astype(self, data_type: DataType) -> UnquantizedTensor:
         dtype = data_type.dtype
         if self.data_type == DT_BF16:
             self.ndarray = bf16_to_fp32(self.ndarray)
         return UnquantizedTensor(self.ndarray.astype(dtype))

-    def to_ggml(self) -> UnquantizedTensor:
+    def to_ggml(self) -> Self:
         return self

     def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
@@ -1350,7 +1350,7 @@ def load_some_model(path: Path) -> ModelPlus:
     # Be extra-friendly and accept either a file or a directory:
     if path.is_dir():
         # Check if it's a set of safetensors files first
-        globs = ["model-00001-of-*.safetensors", "model.safetensors"]
+        globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
         files = [file for glob in globs for file in path.glob(glob)]
         if not files:
             # Try the PyTorch patterns too, with lower priority
docs/HOWTO-add-model.md | 119 (new file)
@@ -0,0 +1,119 @@
## Add a new model architecture to `llama.cpp`

Adding a model requires a few steps:

1. Convert the model to GGUF
2. Define the model architecture in `llama.cpp`
3. Build the GGML graph implementation

After following these steps, you can open a PR.

Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](../examples/main)
- [imatrix](../examples/imatrix)
- [quantize](../examples/quantize)
- [server](../examples/server)

### 1. Convert the model to GGUF

This step is done in Python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).

The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

The required steps to implement for an HF model are:

1. Define the model `Model.register` annotation in a new `Model` subclass, for example:

```python
@Model.register("MyModelForCausalLM")
class MyModel(Model):
    model_arch = gguf.MODEL_ARCH.GROK
```

2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)

Add an enum entry in `MODEL_ARCH`, the model's human-friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.

Example for the `falcon` model:
```python
MODEL_ARCH.FALCON: [
    MODEL_TENSOR.TOKEN_EMBD,
    MODEL_TENSOR.OUTPUT_NORM,
    MODEL_TENSOR.OUTPUT,
    MODEL_TENSOR.ATTN_NORM,
    MODEL_TENSOR.ATTN_NORM_2,
    MODEL_TENSOR.ATTN_QKV,
    MODEL_TENSOR.ATTN_OUT,
    MODEL_TENSOR.FFN_DOWN,
    MODEL_TENSOR.FFN_UP,
]
```

3. Map the original tensor names to the standardized equivalent in GGUF

As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.

Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.

If the tensor name is part of a repetitive layer/block, the keyword `bid` substitutes it.

Example for the normalization tensor in attention layers:

```python
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
    # Attention norm
    MODEL_TENSOR.ATTN_NORM: (
        "gpt_neox.layers.{bid}.input_layernorm", # gptneox
        "transformer.h.{bid}.ln_1",              # gpt2 gpt-j refact qwen
        "transformer.blocks.{bid}.norm_1",       # mpt
        ...
    )
}
```

`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.

Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
- `Model#set_gguf_parameters`
- `Model#set_vocab`
- `Model#write_tensors`

NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` expect this suffix for the weights.

### 2. Define the model architecture in `llama.cpp`

The model params and tensors layout must be defined in `llama.cpp`:
1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non-standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`

NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.

### 3. Build the GGML graph implementation

This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.

Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.

When implementing a new graph, please note that the underlying `ggml` backends might not support them all; support for missing backend operations can be added in another PR.

Note: to debug the inference graph, you can use [eval-callback](../examples/eval-callback).

## GGUF specification

https://github.com/ggerganov/ggml/blob/master/docs/gguf.md

## Resources

- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
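As a rough, hedged sketch of the kind of entries that step 2 of the HOWTO above ends up adding inside `llama.cpp`: the snippet below is a self-contained mock-up, not the actual `llama.cpp` definitions; the `*_sketch` types, the `mymodel` name and the exact set of entries are placeholders, and the authoritative `llm_arch`, `LLM_ARCH_NAMES` and `LLM_TENSOR_NAMES` tables live in `llama.cpp` itself.

```cpp
#include <map>
#include <string>

// placeholder mirror of an architecture enum: one entry per supported architecture
enum llm_arch_sketch {
    LLM_ARCH_MYMODEL,
    LLM_ARCH_UNKNOWN,
};

// placeholder mirror of the per-architecture tensor identifiers
enum llm_tensor_sketch {
    LLM_TENSOR_TOKEN_EMBD,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_ATTN_NORM,
    LLM_TENSOR_ATTN_QKV,
    LLM_TENSOR_ATTN_OUT,
    LLM_TENSOR_FFN_DOWN,
    LLM_TENSOR_FFN_UP,
};

// human-friendly architecture name, written into the GGUF metadata
static const std::map<llm_arch_sketch, const char *> ARCH_NAMES_SKETCH = {
    { LLM_ARCH_MYMODEL, "mymodel" },
};

// GGUF tensor-name templates; "%d" stands for the block index (the `bid` used in the mapping file)
static const std::map<llm_arch_sketch, std::map<llm_tensor_sketch, std::string>> TENSOR_NAMES_SKETCH = {
    {
        LLM_ARCH_MYMODEL,
        {
            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
            { LLM_TENSOR_OUTPUT,      "output" },
            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
        },
    },
};
```

The remaining items of that step (hparams, tensor creation, rope type) then hook the new enum value into `llm_load_hparams`, `llm_load_tensors` and `llama_rope_type`.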
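For step 3 of the HOWTO (the graph implementation), the following is a heavily simplified, hedged sketch of the general shape of a `build_*` function, limited to a single SwiGLU feed-forward block built from public `ggml` ops. It assumes the context, input activations and weight tensors already exist; the real graph builders in `llama_build_graph` additionally handle attention, RoPE, the KV cache, batching and backend-specific details.

```cpp
#include "ggml.h"

// Sketch only: one feed-forward block with an RMS norm and a residual connection.
static struct ggml_cgraph * build_ffn_block_sketch(
        struct ggml_context * ctx0,
        struct ggml_tensor  * inpL,       // input activations
        struct ggml_tensor  * ffn_norm_w, // per-layer norm weight
        struct ggml_tensor  * ffn_up,
        struct ggml_tensor  * ffn_gate,
        struct ggml_tensor  * ffn_down,
        float                 rms_eps) {
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    // pre-FFN RMS norm
    struct ggml_tensor * cur = ggml_rms_norm(ctx0, inpL, rms_eps);
    cur = ggml_mul(ctx0, cur, ffn_norm_w);

    // SwiGLU: down( silu(gate(x)) * up(x) )
    struct ggml_tensor * up   = ggml_mul_mat(ctx0, ffn_up,   cur);
    struct ggml_tensor * gate = ggml_mul_mat(ctx0, ffn_gate, cur);
    cur = ggml_mul(ctx0, ggml_silu(ctx0, gate), up);
    cur = ggml_mul_mat(ctx0, ffn_down, cur);

    // residual connection back to the block input
    cur = ggml_add(ctx0, cur, inpL);

    ggml_build_forward_expand(gf, cur);
    return gf;
}
```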
@@ -19,6 +19,7 @@ else()
     add_subdirectory(benchmark)
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
+    add_subdirectory(eval-callback)
     add_subdirectory(finetune)
     add_subdirectory(gritlm)
     add_subdirectory(gguf-split)

@@ -123,10 +123,10 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }

-    // add eos if not present
+    // add SEP if not present
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_token_eos(model)) {
-            inp.push_back(llama_token_eos(model));
+        if (inp.empty() || inp.back() != llama_token_sep(model)) {
+            inp.push_back(llama_token_sep(model));
         }
     }

examples/eval-callback/CMakeLists.txt | 9 (new file)
@@ -0,0 +1,9 @@
set(TARGET eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
examples/eval-callback/README.md | 95 (new file)
@@ -0,0 +1,95 @@
# llama.cpp/examples/eval-callback

A simple example which demonstrates how to use a callback during inference.
It simply prints to the console all operations and tensor data.

Usage:

```shell
eval-callback \
  --hf-repo ggml-org/models \
  --hf-file phi-2/ggml-model-q4_0.gguf \
  --model phi-2-q4_0.gguf \
  --prompt hello \
  --seed 42 \
  -ngl 33
```

Will print:

```shell
llm_load_tensors: offloaded 33/33 layers to GPU
...
llama_new_context_with_model: n_ctx = 512
...
llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
llama_new_context_with_model: graph nodes = 1225
llama_new_context_with_model: graph splits = 2
ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
[
 [
  [ -0.0181, 0.0272, 0.0272, ...],
 ],
]
ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
 [
  [ -0.6989, 1.0636, 1.0636, ...],
 ],
]
ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
[
 [
  [ -0.1800, 0.2817, 0.2632, ...],
 ],
]
ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
[
 [
  [ -0.1863, 0.2970, 0.2604, ...],
 ],
]
ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
[
 [
  [ -1.1238, 1.2876, -1.8086, ...],
 ],
]
ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
[
 [
  [ -1.1135, 1.4604, -1.9226, ...],
 ],
]
ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
 [
  [ -1.1135, 1.4604, -1.9226, ...],
 ],
]
ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
 [
  [ -1.1135, 1.4604, -1.9226, ...],
 ],
]
ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
[
 [
  [ -1.1135, 1.4604, -1.9226, ...],
  [ -0.3608, 0.5076, -1.8866, ...],
  [ 1.7643, 0.0273, -2.1065, ...],
  ...
 ],
]
ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
[
 [
  [ -1.1135, 1.4604, -1.9226, ...],
  [ -0.3608, 0.5076, -1.8866, ...],
  [ 1.7643, 0.0273, -2.1065, ...],
  ...
 ],
]
```
185
examples/eval-callback/eval-callback.cpp
Normal file
185
examples/eval-callback/eval-callback.cpp
Normal file
|
@ -0,0 +1,185 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <random>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This the arbitrary data which will be passed to each callback.
|
||||||
|
* Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
|
||||||
|
*/
|
||||||
|
struct callback_data {
|
||||||
|
std::vector<uint8_t> data;
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||||
|
std::string str;
|
||||||
|
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||||
|
str += std::to_string(t->ne[i]);
|
||||||
|
if (i + 1 < GGML_MAX_DIMS) {
|
||||||
|
str += ", ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||||
|
float sum = 0;
|
||||||
|
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||||
|
printf(" [\n");
|
||||||
|
for (int64_t i2 = 0; i2 < ne[2] && i2 < n; i2++) {
|
||||||
|
printf(" [\n");
|
||||||
|
for (int64_t i1 = 0; i1 < ne[1] && i1 < n; i1++) {
|
||||||
|
printf(" [");
|
||||||
|
for (int64_t i0 = 0; i0 < ne[0] && i0 < n; i0++) {
|
||||||
|
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||||
|
float v;
|
||||||
|
if (type == GGML_TYPE_F16) {
|
||||||
|
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
|
||||||
|
} else if (type == GGML_TYPE_F32) {
|
||||||
|
v = *(float *) data + i;
|
||||||
|
} else if (type == GGML_TYPE_I32) {
|
||||||
|
v = (float) *(int32_t *) data + i;
|
||||||
|
} else if (type == GGML_TYPE_I16) {
|
||||||
|
v = (float) *(int16_t *) data + i;
|
||||||
|
} else if (type == GGML_TYPE_I8) {
|
||||||
|
v = (float) *(int8_t *) data + i;
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
printf("%8.4f", v);
|
||||||
|
sum += v;
|
||||||
|
if (i0 < ne[0] - 1 && i0 < n - 1) printf(", ");
|
||||||
|
}
|
||||||
|
if (ne[0] > n) printf(", ...");
|
||||||
|
printf("],\n");
|
||||||
|
}
|
||||||
|
if (ne[1] > n) printf(" ...\n");
|
||||||
|
printf(" ],\n");
|
||||||
|
}
|
||||||
|
if (ne[2] > n) printf(" ...\n");
|
||||||
|
printf(" ]\n");
|
||||||
|
printf(" sum = %f\n", sum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GGML operations callback during the graph execution.
|
||||||
|
*
|
||||||
|
* @param t current tensor
|
||||||
|
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||||
|
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
|
||||||
|
* see ggml_backend_sched_eval_callback
|
||||||
|
* @param user_data user data to pass at each call back
|
||||||
|
* @return true to receive data or continue the graph, false otherwise
|
||||||
|
*/
|
||||||
|
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
|
auto * cb_data = (callback_data *) user_data;
|
||||||
|
|
||||||
|
const struct ggml_tensor * src0 = t->src[0];
|
||||||
|
const struct ggml_tensor * src1 = t->src[1];
|
||||||
|
|
||||||
|
if (ask) {
|
||||||
|
return true; // Always retrieve data
|
||||||
|
}
|
||||||
|
|
||||||
|
char src1_str[128] = {0};
|
||||||
|
if (src1) {
|
||||||
|
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||||
|
t->name, ggml_type_name(t->type), ggml_op_name(t->op),
|
||||||
|
src0->name, ggml_ne_string(src0).c_str(),
|
||||||
|
src1 ? src1_str : "",
|
||||||
|
ggml_ne_string(t).c_str());
|
||||||
|
|
||||||
|
|
||||||
|
// copy the data from the GPU memory if needed
|
||||||
|
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
||||||
|
|
||||||
|
if (!is_host) {
|
||||||
|
auto n_bytes = ggml_nbytes(t);
|
||||||
|
cb_data->data.resize(n_bytes);
|
||||||
|
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ggml_is_quantized(t->type)) {
|
||||||
|
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
||||||
|
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool run(llama_context * ctx, const gpt_params & params) {
|
||||||
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
|
||||||
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
|
|
||||||
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
|
||||||
|
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
|
||||||
|
callback_data cb_data;
|
||||||
|
|
||||||
|
gpt_params params;
|
||||||
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
print_build_info();
|
||||||
|
|
||||||
|
std::mt19937 rng(params.seed);
|
||||||
|
if (params.random_prompt) {
|
||||||
|
params.prompt = gpt_random_prompt(rng);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_backend_init();
|
||||||
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
|
// pass the callback to the backend scheduler
|
||||||
|
// it will be executed for each node during the graph computation
|
||||||
|
params.cb_eval = ggml_debug;
|
||||||
|
params.cb_eval_user_data = &cb_data;
|
||||||
|
params.warmup = false;
|
||||||
|
|
||||||
|
// init
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
if (model == nullptr || ctx == nullptr) {
|
||||||
|
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// print system information
|
||||||
|
{
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
bool OK = run(ctx, params);
|
||||||
|
if (!OK) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_print_timings(ctx);
|
||||||
|
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
|
||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||||
auto prev_stacks = grammar->stacks;
|
auto prev_stacks = grammar->stacks;
|
||||||
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
|
||||||
if (grammar->stacks.empty()) {
|
if (grammar->stacks.empty()) {
|
||||||
error_pos = pos;
|
error_pos = pos;
|
||||||
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
|
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
|
||||||
|
|
|
@ -142,7 +142,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// read and create ggml_context containing the tensors and their data
|
// read and create ggml_context containing the tensors and their data
|
||||||
static bool gguf_ex_read_1(const std::string & fname) {
|
static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
|
||||||
struct ggml_context * ctx_data = NULL;
|
struct ggml_context * ctx_data = NULL;
|
||||||
|
|
||||||
struct gguf_init_params params = {
|
struct gguf_init_params params = {
|
||||||
|
@ -206,7 +206,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
||||||
printf("\n\n");
|
printf("\n\n");
|
||||||
|
|
||||||
// check data
|
// check data
|
||||||
{
|
if (check_data) {
|
||||||
const float * data = (const float *) cur->data;
|
const float * data = (const float *) cur->data;
|
||||||
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
||||||
if (data[j] != 100 + i) {
|
if (data[j] != 100 + i) {
|
||||||
|
@ -229,9 +229,16 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
if (argc < 3) {
|
if (argc < 3) {
|
||||||
printf("usage: %s data.gguf r|w\n", argv[0]);
|
printf("usage: %s data.gguf r|w [n]\n", argv[0]);
|
||||||
|
printf("r: read data.gguf file\n");
|
||||||
|
printf("w: write data.gguf file\n");
|
||||||
|
printf("n: no check of tensor data\n");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
bool check_data = true;
|
||||||
|
if (argc == 4) {
|
||||||
|
check_data = false;
|
||||||
|
}
|
||||||
|
|
||||||
const std::string fname(argv[1]);
|
const std::string fname(argv[1]);
|
||||||
const std::string mode (argv[2]);
|
const std::string mode (argv[2]);
|
||||||
|
@ -242,7 +249,7 @@ int main(int argc, char ** argv) {
|
||||||
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
|
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
|
||||||
} else if (mode == "r") {
|
} else if (mode == "r") {
|
||||||
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
|
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
|
||||||
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
|
GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -349,12 +349,13 @@ static void process_logits(
|
||||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
|
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
@ -596,24 +597,18 @@ int main(int argc, char ** argv) {
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
|
||||||
if (model == NULL) {
|
|
||||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
|
||||||
|
|
||||||
// pass the callback to the backend scheduler
|
// pass the callback to the backend scheduler
|
||||||
// it will be executed for each node during the graph computation
|
// it will be executed for each node during the graph computation
|
||||||
cparams.cb_eval = ik_collect_imatrix;
|
params.cb_eval = ik_collect_imatrix;
|
||||||
cparams.cb_eval_user_data = NULL;
|
params.cb_eval_user_data = NULL;
|
||||||
|
params.warmup = false;
|
||||||
|
|
||||||
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
// init
|
||||||
if (ctx == NULL) {
|
llama_model * model;
|
||||||
fprintf(stderr, "%s: error: unable to create context\n", __func__);
|
llama_context * ctx;
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
if (model == nullptr || ctx == nullptr) {
|
||||||
|
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -239,6 +239,7 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
LOG_TEE("%s\n", get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||||
LOG("add_bos: %d\n", add_bos);
|
LOG("add_bos: %d\n", add_bos);
|
||||||
|
|
||||||
bool suff_rm_leading_spc = params.escape;
|
bool suff_rm_leading_spc = params.escape;
|
||||||
|
@ -279,10 +280,10 @@ int main(int argc, char ** argv) {
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
|
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||||
|
|
||||||
original_prompt_len = original_inp.size();
|
original_prompt_len = original_inp.size();
|
||||||
|
|
|
@ -146,7 +146,6 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
|
|
||||||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
|
|
||||||
|
|
||||||
std::string system_prompt, user_prompt;
|
std::string system_prompt, user_prompt;
|
||||||
size_t image_pos = prompt.find("<image>");
|
size_t image_pos = prompt.find("<image>");
|
||||||
|
@ -180,7 +179,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
|
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
|
||||||
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
||||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||||
|
|
||||||
|
|
|
@ -64,13 +64,10 @@ int main(int argc, char ** argv) {
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
// Tokenize the prompt
|
// Tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
LOG("add_bos tgt: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
std::vector<llama_token> all;
|
std::vector<llama_token> all;
|
||||||
|
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
all = inp;
|
all = inp;
|
||||||
|
|
||||||
const int max_context_size = llama_n_ctx(ctx);
|
const int max_context_size = llama_n_ctx(ctx);
|
||||||
|
|
|
@ -28,10 +28,8 @@ int main(int argc, char ** argv){
|
||||||
GGML_ASSERT(model != nullptr);
|
GGML_ASSERT(model != nullptr);
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
fprintf(stderr, "%s: tokenization done\n", __func__);
|
fprintf(stderr, "%s: tokenization done\n", __func__);
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -34,11 +34,8 @@ int main(int argc, char ** argv){
|
||||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
LOG("add_bos tgt: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
|
|
||||||
llama_ngram_cache ngram_cache_context;
|
llama_ngram_cache ngram_cache_context;
|
||||||
llama_ngram_cache ngram_cache_dynamic;
|
llama_ngram_cache ngram_cache_dynamic;
|
||||||
|
|
|
@ -42,11 +42,8 @@ int main(int argc, char ** argv){
|
||||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
|
||||||
LOG("add_bos tgt: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
|
|
||||||
llama_ngram_cache ngram_cache_context;
|
llama_ngram_cache ngram_cache_context;
|
||||||
llama_ngram_cache ngram_cache_dynamic;
|
llama_ngram_cache ngram_cache_dynamic;
|
||||||
|
|
|
@ -246,6 +246,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||||
LOG("add_bos: %d\n", add_bos);
|
LOG("add_bos: %d\n", add_bos);
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
|
@ -255,7 +256,7 @@ int main(int argc, char ** argv) {
|
||||||
if (params.chatml) {
|
if (params.chatml) {
|
||||||
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
||||||
}
|
}
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
} else {
|
} else {
|
||||||
LOG("use session tokens\n");
|
LOG("use session tokens\n");
|
||||||
embd_inp = session_tokens;
|
embd_inp = session_tokens;
|
||||||
|
@ -277,10 +278,10 @@ int main(int argc, char ** argv) {
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
|
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||||
|
|
||||||
original_prompt_len = original_inp.size();
|
original_prompt_len = original_inp.size();
|
||||||
|
@ -339,14 +340,14 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// prefix & suffix for instruct mode
|
// prefix & suffix for instruct mode
|
||||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
|
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
|
||||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
|
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
|
||||||
|
|
||||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||||
|
|
||||||
// chatml prefix & suffix
|
// chatml prefix & suffix
|
||||||
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
|
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
|
||||||
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
|
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
|
||||||
|
|
||||||
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
|
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
|
||||||
|
|
|
@ -315,10 +315,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||||
// BOS tokens will be added for each chunk before eval
|
// BOS tokens will be added for each chunk before eval
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
|
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
|
@ -454,6 +455,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
// BOS tokens will be added for each chunk before eval
|
// BOS tokens will be added for each chunk before eval
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
|
|
||||||
std::ofstream logits_stream;
|
std::ofstream logits_stream;
|
||||||
if (!params.logits_file.empty()) {
|
if (!params.logits_file.empty()) {
|
||||||
|
@ -470,7 +472,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
@ -771,9 +773,6 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
|
||||||
|
|
||||||
// The tasks should be randomized so the score stabilizes quickly.
|
// The tasks should be randomized so the score stabilizes quickly.
|
||||||
bool randomize_tasks = true;
|
bool randomize_tasks = true;
|
||||||
|
|
||||||
|
@ -818,7 +817,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
||||||
for (size_t j = 0; j < 4; j++) {
|
for (size_t j = 0; j < 4; j++) {
|
||||||
hs_cur.ending[j] = prompt_lines[idx*6+2+j];
|
hs_cur.ending[j] = prompt_lines[idx*6+2+j];
|
||||||
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
|
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// determine the common prefix of the endings
|
// determine the common prefix of the endings
|
||||||
|
@ -837,7 +836,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
|
hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
|
||||||
hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
|
hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
|
||||||
|
|
||||||
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
|
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());
|
||||||
|
|
||||||
// Delete the selected random example from the prompt
|
// Delete the selected random example from the prompt
|
||||||
if (randomize_tasks) {
|
if (randomize_tasks) {
|
||||||
|
@ -1110,12 +1109,9 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||||
|
|
||||||
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
|
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
|
||||||
|
|
||||||
for (auto & task : data) {
|
for (auto & task : data) {
|
||||||
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
|
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
|
||||||
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
|
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
|
||||||
|
|
||||||
task.common_prefix = 0;
|
task.common_prefix = 0;
|
||||||
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
|
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
|
||||||
|
@ -1130,8 +1126,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||||
task.seq_tokens[0].size() - task.common_prefix +
|
task.seq_tokens[0].size() - task.common_prefix +
|
||||||
task.seq_tokens[1].size() - task.common_prefix;
|
task.seq_tokens[1].size() - task.common_prefix;
|
||||||
|
|
||||||
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
|
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
|
||||||
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
|
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
||||||
|
@ -1322,7 +1318,7 @@ struct multiple_choice_task {
|
||||||
std::vector<float> log_probs;
|
std::vector<float> log_probs;
|
||||||
};
|
};
|
||||||
|
|
||||||
static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
|
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
|
||||||
if (task.question.empty() || task.mc1.answers.empty()) {
|
if (task.question.empty() || task.mc1.answers.empty()) {
|
||||||
if (log_error) {
|
if (log_error) {
|
||||||
printf("%s: found bad task with empty question and/or answers\n", __func__);
|
printf("%s: found bad task with empty question and/or answers\n", __func__);
|
||||||
|
@ -1337,7 +1333,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
|
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
|
||||||
}
|
}
|
||||||
auto min_len = task.seq_tokens.front().size();
|
auto min_len = task.seq_tokens.front().size();
|
||||||
for (auto& seq : task.seq_tokens) {
|
for (auto& seq : task.seq_tokens) {
|
||||||
|
@ -1436,9 +1432,6 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||||
n_task = params.multiple_choice_tasks;
|
n_task = params.multiple_choice_tasks;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
|
||||||
|
|
||||||
printf("%s: preparing task data", __func__);
|
printf("%s: preparing task data", __func__);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
if (n_task > 500) {
|
if (n_task > 500) {
|
||||||
|
@ -1446,7 +1439,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
std::atomic<int> counter(0);
|
std::atomic<int> counter(0);
|
||||||
std::atomic<int> n_bad(0);
|
std::atomic<int> n_bad(0);
|
||||||
auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
|
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
|
||||||
int num_tasks = tasks.size();
|
int num_tasks = tasks.size();
|
||||||
int n_bad_local = 0;
|
int n_bad_local = 0;
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@ -1457,7 +1450,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||||
}
|
}
|
||||||
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
|
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
|
||||||
for (int i = first; i < last; ++i) {
|
for (int i = first; i < last; ++i) {
|
||||||
if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
|
if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -1479,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||||
int i_task = 0;
|
int i_task = 0;
|
||||||
for (auto& task : tasks) {
|
for (auto& task : tasks) {
|
||||||
++i_task;
|
++i_task;
|
||||||
if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
|
if (!multiple_choice_prepare_one_task(ctx, task, true)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (i_task%n_dot == 0) {
|
if (i_task%n_dot == 0) {
|
||||||
|
@ -1715,6 +1708,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||||
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
|
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
|
||||||
const int nv = 2*((n_vocab + 1)/2) + 4;
|
const int nv = 2*((n_vocab + 1)/2) + 4;
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||||
|
|
||||||
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
|
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
|
||||||
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
|
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@@ -51,6 +51,26 @@
  margin-bottom: 0.5em;
  }

+ button, input, textarea, .button, a.button, select {
+ color: #666;
+ border: 1px solid #ddd;
+ border-radius: 4px;
+ line-height: 1.5em;
+ padding: 0.25em 0.25em;
+ text-decoration: none;
+ font-size: 1.1rem;
+ }
+
+ button {
+ border: 1px solid #2a8aad;
+ background: #3584e4;
+ font-weight: normal;
+ color: #fff;
+ }
+ button:disabled {
+ background: #9cbce5;
+ }
+
  #write form {
  margin: 1em 0 0 0;
  display: flex;

@@ -406,7 +426,7 @@
  throw new Error("already running");
  }
  controller.value = new AbortController();
- for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: document.baseURI.replace(/\/+$/, '') })) {
+ for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
  const data = chunk.data;

  if (data.stop) {

@@ -567,7 +587,7 @@
  runCompletion();
  }
  return html`
- <div>
+ <div class="right">
  <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
  <button onclick=${stop} disabled=${!generating.value}>Stop</button>
  <button onclick=${reset}>Reset</button>

@@ -1015,6 +1035,10 @@
  }

  function App(props) {
+ useEffect(() => {
+ const query = new URLSearchParams(location.search).get("q");
+ if (query) chat(query);
+ }, []);

  return html`
  <div class="mode-${session.value.type}">
@@ -689,6 +689,7 @@ struct server_context {
  n_ctx = llama_n_ctx(ctx);

  add_bos_token = llama_should_add_bos_token(model);
+ GGML_ASSERT(llama_add_eos_token(model) != 1);

  return true;
  }

@@ -758,7 +759,7 @@ struct server_context {
  metrics.init();
  }

- std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
+ std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
  // TODO: currently, we tokenize using special tokens by default
  // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
  // but it's better compared to completely ignoring ChatML and other chat templates

@@ -776,7 +777,7 @@ struct server_context {

  std::vector<llama_token> p;
  if (first) {
- p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+ p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
  first = false;
  } else {
  p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);

@@ -793,7 +794,7 @@ struct server_context {
  }
  } else {
  auto s = json_prompt.template get<std::string>();
- prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+ prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
  }

  return prompt_tokens;

@@ -1058,7 +1059,7 @@ struct server_context {
  system_tokens.clear();

  if (!system_prompt.empty()) {
- system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+ system_tokens = ::llama_tokenize(ctx, system_prompt, true);

  llama_batch_clear(batch);

@@ -1914,7 +1915,7 @@ struct server_context {
  prefix_tokens.push_back(llama_token_middle(model));
  prompt_tokens = prefix_tokens;
  } else {
- prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
+ prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
  }

  slot.n_past = 0;
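Note on the tokenize() rename above: the second argument now says whether the prompt gets the model's leading special tokens at all, not specifically a BOS. A minimal sketch of the caller-side reasoning, with a hypothetical helper name (the real code inlines the condition, as in the @1914 hunk):

    // Sketch: the prompt only adds its own special tokens when no system prompt
    // was tokenized before it; otherwise BOS already sits at the front of
    // system_tokens and must not be duplicated.
    static bool prompt_adds_special(const std::string & system_prompt) {
        return system_prompt.empty();
    }
    // usage, mirroring the hunk above: tokenize(slot.prompt, prompt_adds_special(system_prompt));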
@@ -76,6 +76,28 @@ int main(int argc, char ** argv) {
  params.n_threads_batch = params.n_threads_batch_draft;
  std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

+ const bool vocab_type_tgt = llama_vocab_type(model_tgt);
+ LOG("vocab_type tgt: %d\n", vocab_type_tgt);
+
+ const bool vocab_type_dft = llama_vocab_type(model_dft);
+ LOG("vocab_type dft: %d\n", vocab_type_dft);
+
+ if (vocab_type_tgt != vocab_type_dft) {
+ fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
+ fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+ return 1;
+ }
+
+ if (
+ llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
+ llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
+ llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
+ llama_token_eos(model_tgt) != llama_token_eos(model_dft)
+ ) {
+ fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
+ return 1;
+ }
+
  {
  const int n_vocab_tgt = llama_n_vocab(model_tgt);
  const int n_vocab_dft = llama_n_vocab(model_dft);

@@ -105,20 +127,8 @@ int main(int argc, char ** argv) {


  // Tokenize the prompt
- const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
- LOG("add_bos tgt: %d\n", add_bos_tgt);
-
- const bool add_bos_dft = llama_should_add_bos_token(model_dft);
- LOG("add_bos dft: %d\n", add_bos_dft);
-
- if (add_bos_tgt != add_bos_dft) {
- fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
- fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
- return 1;
- }
-
  std::vector<llama_token> inp;
- inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
+ inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);

  const int max_context_size = llama_n_ctx(ctx_tgt);
  const int max_tokens_list_size = max_context_size - 4;
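The guards added in the @76 hunk can be read as a single compatibility predicate. A condensed sketch with a hypothetical helper name, using only functions that already appear in the hunk:

    // Sketch: a target/draft pair is usable for speculation only when vocab type
    // and BOS/EOS handling agree between the two models.
    static bool speculation_compatible(const llama_model * tgt, const llama_model * dft) {
        return llama_vocab_type(tgt)    == llama_vocab_type(dft)    &&
               llama_add_bos_token(tgt) == llama_add_bos_token(dft) &&
               llama_add_eos_token(tgt) == llama_add_eos_token(dft) &&
               llama_token_bos(tgt)     == llama_token_bos(dft)     &&
               llama_token_eos(tgt)     == llama_token_eos(dft);
    }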
@@ -26,11 +26,9 @@ int main(int argc, char ** argv) {
  llama_context_params ctx_params = llama_context_default_params();
  llama_context * ctx = llama_new_context_with_model(model, ctx_params);

- const bool add_bos = llama_should_add_bos_token(model);
-
  std::vector<llama_token> tokens;

- tokens = ::llama_tokenize(model, prompt, add_bos, true);
+ tokens = ::llama_tokenize(model, prompt, true, true);

  for (int i = 0; i < (int) tokens.size(); i++) {
  if (printing_ids) {
@@ -1225,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas(

  // the main device has a larger memory buffer to hold the results from all GPUs
  // ldc == nrows of the matrix that cuBLAS writes into
- int ldc = id == ctx.device ? ne0 : row_diff;
+ int64_t ldc = id == ctx.device ? ne0 : row_diff;

  const int compute_capability = ggml_cuda_info().devices[id].cc;

@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
  const int64_t ne0 = dst->ne[0];
  const int64_t ne1 = dst->ne[1];

- const int nb2 = dst->nb[2];
- const int nb3 = dst->nb[3];
+ const int64_t nb2 = dst->nb[2];
+ const int64_t nb3 = dst->nb[3];

  GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
  GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
@@ -394,7 +394,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
  // TODO: move to ggml-common.h
  static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

- typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+ typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);


  //////////////////////
@@ -4,14 +4,14 @@
  #define CUDA_Q8_0_NE_ALIGN 2048

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
- static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
- const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+ const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);

  if (i >= k) {
  return;
  }

- const int ib = i/qk; // block index
+ const int64_t ib = i/qk; // block index
  const int iqs = (i%qk)/qr; // quant index
  const int iybs = i - i%qk; // y block start index
  const int y_offset = qr == 1 ? 1 : qk/2;

@@ -25,7 +25,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
  }

  template <bool need_check>
- static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
  #if __CUDA_ARCH__ >= CC_PASCAL
  constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

@@ -68,13 +68,13 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
  template<typename dst_t>
  static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

- const int i = blockIdx.x;
+ const int64_t i = blockIdx.x;

  // assume 32 threads
  const int tid = threadIdx.x;
  const int il = tid/8;
  const int ir = tid%8;
- const int ib = 8*i + ir;
+ const int64_t ib = 8*i + ir;
  if (ib >= nb32) {
  return;
  }

@@ -96,13 +96,13 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
  template<typename dst_t>
  static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

- const int i = blockIdx.x;
+ const int64_t i = blockIdx.x;

  // assume 32 threads
  const int tid = threadIdx.x;
  const int il = tid/8;
  const int ir = tid%8;
- const int ib = 8*i + ir;
+ const int64_t ib = 8*i + ir;
  if (ib >= nb32) {
  return;
  }

@@ -313,14 +313,14 @@ template<typename dst_t>
  static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  const block_q6_K * x = (const block_q6_K *) vx;

- const int i = blockIdx.x;
+ const int64_t i = blockIdx.x;
  #if QK_K == 256

  // assume 64 threads - this is very slightly better than the one below
- const int tid = threadIdx.x;
- const int ip = tid/32; // ip is 0 or 1
- const int il = tid - 32*ip; // 0...32
- const int is = 8*ip + il/16;
+ const int64_t tid = threadIdx.x;
+ const int64_t ip = tid/32; // ip is 0 or 1
+ const int64_t il = tid - 32*ip; // 0...32
+ const int64_t is = 8*ip + il/16;

  dst_t * y = yy + i*QK_K + 128*ip + il;

@@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
  #else

  // assume 32 threads
- const int tid = threadIdx.x;
- const int ip = tid/16; // 0 or 1
- const int il = tid - 16*ip; // 0...15
+ const int64_t tid = threadIdx.x;
+ const int64_t ip = tid/16; // 0 or 1
+ const int64_t il = tid - 16*ip; // 0...15

  dst_t * y = yy + i*QK_K + 16*ip + il;

@@ -571,12 +571,12 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
  #endif

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
- static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
  const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
  dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

- static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
  if (k % CUDA_Q8_0_NE_ALIGN == 0) {
  const bool need_check = false;

@@ -588,7 +588,7 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
  }

  template<typename dst_t>
- static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  #if QK_K == 256
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -598,7 +598,7 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cu
  }

  template<typename dst_t>
- static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  #if QK_K == 256
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -608,27 +608,27 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
  }

  template<typename dst_t>
- static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb32 = k / 32;
  const int nb = (k + 255) / 256;
  dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
  }

  template<typename dst_t>
- static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb32 = k / 32;
  const int nb = (k + 255) / 256;
  dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
  }

  template<typename dst_t>
- static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
  }

  template<typename dst_t>
- static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  #if QK_K == 256
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -638,7 +638,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
  }

  template<typename dst_t>
- static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  #if QK_K == 256
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -648,55 +648,55 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
  }

  template<typename dst_t>
- static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
  }

  template<typename dst_t>
- static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
  }

  template<typename dst_t>
- static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
  }

  template<typename dst_t>
- static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
  }

  template<typename dst_t>
- static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
  }

  template<typename dst_t>
- static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
  }

  template<typename dst_t>
- static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = (k + QK_K - 1) / QK_K;
  dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
  }

  template<typename dst_t>
- static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = k / QK_K;
  dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
  }

  template<typename dst_t>
- static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
  const int nb = (k + QK_K - 1) / QK_K;
  #if QK_K == 64
  dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);

@@ -706,8 +706,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
  }

  template <typename src_t, typename dst_t>
- static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+ const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

  if (i >= k) {
  return;

@@ -719,7 +719,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
  }

  template <typename src_t, typename dst_t>
- static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
  convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }
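All of the int to int64_t edits in the kernels above target the same failure mode: element counts, and the flat indices computed from them, can exceed INT_MAX for large tensors, so 32-bit arithmetic silently wraps. A standalone host-side sketch of the overflow (hypothetical sizes, not taken from the diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // assumed tensor shape: 8192 x 4096 x 128 elements = 2^32, above INT_MAX
        const int64_t ne0 = 8192, ne1 = 4096, ne2 = 128;
        const int64_t k   = ne0*ne1*ne2;   // correct 64-bit element count: 4294967296
        const int     k32 = (int) k;       // narrows; wraps to 0 on typical two's-complement targets
        printf("int64_t k = %lld, int k = %d\n", (long long) k, k32);
        return 0;
    }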
@@ -3,7 +3,7 @@
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

  template<typename T>
- using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+ using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);

  typedef to_t_cuda_t<float> to_fp32_cuda_t;
  typedef to_t_cuda_t<half> to_fp16_cuda_t;
@@ -1,6 +1,6 @@
  #include "common.cuh"

- static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
  const block_q4_0 * x = (const block_q4_0 *) vx;

  const dfloat d = x[ib].d;

@@ -19,7 +19,7 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
  #endif // GGML_CUDA_F16
  }

- static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
  const block_q4_1 * x = (const block_q4_1 *) vx;

  const dfloat d = __low2half(x[ib].dm);

@@ -39,7 +39,7 @@ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const in
  #endif // GGML_CUDA_F16
  }

- static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
  const block_q5_0 * x = (const block_q5_0 *) vx;

  const dfloat d = x[ib].d;

@@ -62,7 +62,7 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
  #endif // GGML_CUDA_F16
  }

- static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
  const block_q5_1 * x = (const block_q5_1 *) vx;

  const dfloat d = __low2half(x[ib].dm);

@@ -86,7 +86,7 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
  #endif // GGML_CUDA_F16
  }

- static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
  const block_q8_0 * x = (const block_q8_0 *) vx;

  const dfloat d = x[ib].d;
@@ -565,7 +565,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
  }
  }

- static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
  const half * x = (const half *) vx;

  // automatic half -> float type cast if dfloat == float

@@ -577,7 +577,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
  static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
  // qk = quantized weights per x block
  // qr = number of quantized weights per data value in x block
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
+ const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;

  if (row >= nrows) {
  return;

@@ -598,7 +598,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons

  for (int i = 0; i < ncols; i += iter_stride) {
  const int col = i + vals_per_iter*tid;
- const int ib = (row*ncols + col)/qk; // x block index
+ const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
  const int iqs = (col%qk)/qr; // x quant index
  const int iybs = col - col%qk; // y block start index
@@ -1,20 +1,20 @@
  #include "quantize.cuh"

- static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
- const int ix = blockDim.x*blockIdx.x + threadIdx.x;
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
+ const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

  if (ix >= kx_padded) {
  return;
  }

- const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+ const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;

- const int i_padded = iy*kx_padded + ix;
+ const int64_t i_padded = (int64_t)iy*kx_padded + ix;

  block_q8_1 * y = (block_q8_1 *) vy;

- const int ib = i_padded / QK8_1; // block index
- const int iqs = i_padded % QK8_1; // quant index
+ const int64_t ib = i_padded / QK8_1; // block index
+ const int64_t iqs = i_padded % QK8_1; // quant index

  const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
  float amax = fabsf(xi);

@@ -36,8 +36,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
  }

- void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
- const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+ void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
+ const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
  const dim3 num_blocks(block_num_x, ky, 1);
  const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
@@ -2,4 +2,4 @@

  #define CUDA_QUANTIZE_BLOCK_SIZE 256

- void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream);
+ void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);

310 ggml-quants.c
File diff suppressed because it is too large

148 ggml-quants.h
@@ -12,70 +12,70 @@ extern "C" {
  #endif

  // Quantization
- void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
- void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
- void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
- void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
- void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
- void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
+ void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);

- void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
- void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
- void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
- void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
- void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
- void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
+ void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);

- void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
- void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
- void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
- void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
- void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
+ void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
+ void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
+ void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
+ void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
+ void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);

- void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

- void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

- void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

  // Dequantization
- void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

- void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

- void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

  // Dot product
  void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

@@ -101,26 +101,26 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
  void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
- size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+ size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

- size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
- size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+ size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

  void iq2xs_init_impl(enum ggml_type type);
  void iq2xs_free_impl(enum ggml_type type);
16 ggml.c

@@ -338,14 +338,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
  return GGML_FP32_TO_FP16(x);
  }

- void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
- for (int i = 0; i < n; i++) {
+ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
+ for (int64_t i = 0; i < n; i++) {
  y[i] = GGML_FP16_TO_FP32(x[i]);
  }
  }

- void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
- int i = 0;
+ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+ int64_t i = 0;
  #if defined(__F16C__)
  for (; i + 7 < n; i += 8) {
  __m256 x_vec = _mm256_loadu_ps(x + i);

@@ -20331,11 +20331,11 @@ size_t ggml_quantize_chunk(
  enum ggml_type type,
  const float * src,
  void * dst,
- int start,
- int nrows,
- int n_per_row,
+ int64_t start,
+ int64_t nrows,
+ int64_t n_per_row,
  const float * imatrix) {
- const int n = nrows * n_per_row;
+ const int64_t n = (int64_t) nrows * n_per_row;

  if (ggml_quantize_requires_imatrix(type)) {
  GGML_ASSERT(imatrix != NULL);
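With row counts now passed as int64_t, a caller can size source and destination buffers for very large tensors without an intermediate 32-bit product. A hedged usage sketch — the dimensions are hypothetical, and it assumes ggml_row_size() from ggml.h for the destination size:

    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    void quantize_whole_tensor_sketch(const float * src) {
        const int64_t nrows     = 64000;   // e.g. a large vocabulary embedding (assumed)
        const int64_t n_per_row = 8192;    // assumed row width
        // bytes per quantized row times number of rows
        std::vector<uint8_t> dst(ggml_row_size(GGML_TYPE_Q4_0, n_per_row) * nrows);
        // Q4_0 does not require an importance matrix, so imatrix may be null
        const size_t written = ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst.data(),
                                                   /*start=*/0, nrows, n_per_row, /*imatrix=*/nullptr);
        (void) written;
    }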
14 ggml.h

@@ -332,8 +332,8 @@ extern "C" {
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

- GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
- GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);

  struct ggml_object;
  struct ggml_context;

@@ -2210,9 +2210,9 @@ extern "C" {
  enum ggml_type type,
  const float * src,
  void * dst,
- int start,
- int nrows,
- int n_per_row,
+ int64_t start,
+ int64_t nrows,
+ int64_t n_per_row,
  const float * imatrix);

  //

@@ -2377,8 +2377,8 @@ extern "C" {
  #else
  #define GGML_RESTRICT restrict
  #endif
- typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+ typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
  typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
  const void * GGML_RESTRICT y, size_t by, int nrc);
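Any custom conversion routine plugged into ggml's type traits must now match the 64-bit signatures above. A minimal conforming sketch (hypothetical function name; the typedef itself is real per the hunk):

    #include <stdint.h>
    #include "ggml.h"   // for GGML_RESTRICT and ggml_to_float_t

    // The element count arrives as int64_t, so the loop index must be 64-bit too.
    static void to_float_f32_copy(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
        const float * xf = (const float *) x;
        for (int64_t i = 0; i < k; ++i) {
            y[i] = xf[i];   // trivial "dequantization" of an already-f32 row
        }
    }

    // static const ggml_to_float_t to_float = to_float_f32_copy;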
@@ -639,6 +639,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
     ],
     # TODO
 }
@@ -285,12 +285,14 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm", # persimmon
+            "model.layers.{bid}.self_attn.q_norm",      # cohere
             "transformer.blocks.{bid}.attn.q_ln",       # sea-lion
         ),
 
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm", # persimmon
+            "model.layers.{bid}.self_attn.k_norm",      # cohere
             "transformer.blocks.{bid}.attn.k_ln",       # sea-lion
         ),
 
llama.cpp

@@ -318,6 +318,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_CLS_ID,
+    LLM_KV_TOKENIZER_MASK_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_ADD_PREFIX,

@@ -388,6 +390,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
     { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
     { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+    { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+    { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
     { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
     { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
     { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },

@@ -926,6 +930,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
     {

@@ -1699,6 +1705,8 @@ enum e_model {
     MODEL_MEDIUM,
     MODEL_LARGE,
     MODEL_XL,
+    MODEL_8x7B,
+    MODEL_8x22B,
 };
 
 static const size_t kiB = 1024;
@@ -2021,6 +2029,8 @@ struct llama_vocab {
     id special_unk_id = 0;
     id special_sep_id = -1;
     id special_pad_id = -1;
+    id special_cls_id = -1;
+    id special_mask_id = -1;
 
     int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
     int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.

@@ -3550,6 +3560,8 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE: return "0.8B";
         case MODEL_XL: return "1.5B";
+        case MODEL_8x7B: return "8x7B";
+        case MODEL_8x22B: return "8x22B";
         default: return "?B";
     }
 }

@@ -3664,6 +3676,13 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
+                if (hparams.n_expert == 8) {
+                    switch (hparams.n_layer) {
+                        case 32: model.type = e_model::MODEL_8x7B; break;
+                        case 56: model.type = e_model::MODEL_8x22B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
+                } else {
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;

@@ -3674,6 +3693,7 @@ static void llm_load_hparams(
                     case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+                }
             } break;
         case LLM_ARCH_MINICPM:
             {
@@ -3976,7 +3996,9 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
+static std::vector<llama_vocab::id> llama_tokenize_internal(
+    const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
+);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
@@ -4003,6 +4025,8 @@ static void llm_load_vocab(
         vocab.special_unk_id = -1;
         vocab.special_sep_id = -1;
         vocab.special_pad_id = -1;
+        vocab.special_cls_id = -1;
+        vocab.special_mask_id = -1;
         vocab.linefeed_id = -1;
 
         return;

@@ -4015,6 +4039,8 @@ static void llm_load_vocab(
         vocab.special_unk_id = 0;
         vocab.special_sep_id = -1;
         vocab.special_pad_id = -1;
+        vocab.special_cls_id = -1;
+        vocab.special_mask_id = -1;
 
         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {

@@ -4054,15 +4080,19 @@ static void llm_load_vocab(
             vocab.special_unk_id = -1;
             vocab.special_sep_id = -1;
             vocab.special_pad_id = -1;
+            vocab.special_cls_id = -1;
+            vocab.special_mask_id = -1;
         } else if (tokenizer_name == "bert") {
             vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
             // default special tokens
-            vocab.special_bos_id = 101;
+            vocab.special_bos_id = -1;
-            vocab.special_eos_id = 102;
+            vocab.special_eos_id = -1;
             vocab.special_unk_id = 100;
-            vocab.special_sep_id = -1;
+            vocab.special_sep_id = 102;
-            vocab.special_pad_id = -1;
+            vocab.special_pad_id = 0;
+            vocab.special_cls_id = 101;
+            vocab.special_mask_id = 103;
             vocab.add_space_prefix = false;
         } else {
             LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
@@ -4130,6 +4160,8 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
             { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
             { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+            { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+            { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
         };
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));

@@ -4326,6 +4358,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
     if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
     if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
@@ -5406,6 +5440,11 @@ static bool llm_load_tensors(
 
                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
+                    if (n_layer >= 64){
+                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
+                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
+                    }
+
                     layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                     layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                     layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
@@ -9454,6 +9493,31 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+                                ggml_element_size(Qcur) * n_embd_head,
+                                ggml_element_size(Qcur) * n_embd_head * n_head,
+                                0);
+                    cb(Qcur, "Qcur", il);
+                    Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+                                ggml_element_size(Kcur) * n_embd_head,
+                                ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+                                0);
+                    cb(Kcur, "Kcur", il);
+
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                                model.layers[il].attn_q_norm,
+                                NULL,
+                                LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                                model.layers[il].attn_k_norm,
+                                NULL,
+                                LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+                }
+
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
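For context, a small standalone sketch of the stride arithmetic behind the two ggml_view_3d calls above; the head and embedding sizes are hypothetical, only the formulas mirror the hunk.

    #include <cstddef>
    #include <cstdio>

    int main() {
        // hypothetical sizes for a model that uses per-head Q/K normalization
        const size_t element_size = 4;   // f32
        const size_t n_embd_head  = 128;
        const size_t n_head       = 64;

        // strides passed to ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, nb1, nb2, 0):
        const size_t nb1 = element_size * n_embd_head;          // bytes to step to the next head
        const size_t nb2 = element_size * n_embd_head * n_head; // bytes to step to the next token

        printf("nb1 = %zu bytes, nb2 = %zu bytes\n", nb1, nb2);  // 512, 32768
        // llm_build_norm with LLM_NORM then normalizes over the first dimension,
        // i.e. each head of Q and K is normalized independently per token.
        return 0;
    }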
@@ -11057,7 +11121,7 @@ struct llm_tokenizer_bpe {
             add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
         }
 
-        // add the fnished tokens to the final list keeping correct order for next and prev
+        // add the finished tokens to the final list keeping correct order for next and prev
         for (auto & sym : symbols) {
             if (sym.n > 0) {
                 sym.prev = final_prev_index;
@@ -11326,9 +11390,6 @@ struct llm_tokenizer_wpm {
                 output.push_back(vocab.special_unk_id);
             }
         }
-
-        // append eos token
-        output.push_back(vocab.special_eos_id);
     }
 
     std::vector<std::string> preprocess(const std::string & text) {
@@ -11533,30 +11594,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
         }
     }
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
     std::vector<llama_vocab::id> output;
 
-    // OG tokenizer behavior:
-    //
-    // tokenizer.encode('', add_bos=True) returns [1]
-    // tokenizer.encode('', add_bos=False) returns []
-
-    if (bos && vocab.special_bos_id != -1) {
-        output.push_back(vocab.special_bos_id);
-    }
-
-    if (raw_text.empty()) {
-        return output;
-    }
 
     std::forward_list<fragment_buffer_variant> fragment_buffer;
-    fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
 
-    if (special) tokenizer_st_partition(vocab, fragment_buffer);
+    if (!raw_text.empty()) {
+        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+        if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
+    }
 
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
+                // OG tokenizer behavior:
+                //
+                // tokenizer.encode('', add_special_tokens=True) returns [1]
+                // tokenizer.encode('', add_special_tokens=False) returns []
+
+                if (add_special && vocab.special_add_bos != 0) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.push_back(vocab.special_bos_id);
+                }
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -11582,9 +11641,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         output.push_back(fragment.token);
                     }
                 }
+
+                if (add_special && vocab.special_add_eos == 1) {
+                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    output.push_back(vocab.special_eos_id);
+                }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
+                if (add_special && vocab.special_add_bos == 1) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.push_back(vocab.special_bos_id);
+                }
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11598,9 +11667,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         output.push_back(fragment.token);
                     }
                 }
+
+                GGML_ASSERT(vocab.special_add_eos != 1);
             } break;
        case LLAMA_VOCAB_TYPE_WPM:
             {
+                if (add_special) {
+                    GGML_ASSERT(vocab.special_cls_id != -1);
+                    output.push_back(vocab.special_cls_id);
+                }
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11614,6 +11690,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         output.push_back(fragment.token);
                     }
                 }
+
+                if (add_special) {
+                    GGML_ASSERT(vocab.special_sep_id != -1);
+                    output.push_back(vocab.special_sep_id);
+                }
             } break;
         case LLAMA_VOCAB_TYPE_NONE:
             GGML_ASSERT(false);
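For context, a standalone sketch (not part of the patch) of the token layout the WPM branch now produces when add_special is set, using the BERT-style defaults from llm_load_vocab above (CLS = 101, SEP = 102); the word-piece ids are hypothetical.

    #include <cstdio>
    #include <vector>

    // toy stand-ins for the ids assigned to a "bert" tokenizer in llm_load_vocab
    struct toy_vocab { int cls_id = 101; int sep_id = 102; };

    // mirrors the framing added to the WPM branch: [CLS] <pieces> [SEP]
    static std::vector<int> frame_wpm(const toy_vocab & vocab, const std::vector<int> & pieces, bool add_special) {
        std::vector<int> out;
        if (add_special) out.push_back(vocab.cls_id);
        out.insert(out.end(), pieces.begin(), pieces.end());
        if (add_special) out.push_back(vocab.sep_id);
        return out;
    }

    int main() {
        const std::vector<int> pieces = {7592, 2088}; // hypothetical word-piece ids
        for (int id : frame_wpm(toy_vocab{}, pieces, true)) printf("%d ", id); // 101 7592 2088 102
        printf("\n");
        return 0;
    }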
@@ -11780,7 +11861,9 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
     if (stack.empty()) {
-        new_stacks.emplace_back(stack);
+        if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+            new_stacks.emplace_back(stack);
+        }
         return;
     }
 

@@ -11817,7 +11900,10 @@ static void llama_grammar_advance_stack(
             }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.emplace_back(stack);
+            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+                // only add the stack if it's not a duplicate of one we already have
+                new_stacks.emplace_back(stack);
+            }
             break;
         default:
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -11831,12 +11917,13 @@ static void llama_grammar_advance_stack(
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+void llama_grammar_accept(
         const std::vector<std::vector<llama_grammar_element>> & rules,
         const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t chr) {
+        const uint32_t chr,
+        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
-    std::vector<std::vector<const llama_grammar_element *>> new_stacks;
+    new_stacks.clear();
 
     for (const auto & stack : stacks) {
         if (stack.empty()) {
@@ -11855,8 +11942,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
             llama_grammar_advance_stack(rules, new_stack, new_stacks);
         }
     }
-
-    return new_stacks;
 }
 
 static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@@ -11870,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
         const std::vector<llama_grammar_candidate> & candidates) {
 
     std::vector<llama_grammar_candidate> rejects;
+    rejects.reserve(candidates.size());
 
     if (stack.empty()) {
         for (const auto & tok : candidates) {

@@ -11883,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
     const llama_grammar_element * stack_pos = stack.back();
 
     std::vector<llama_grammar_candidate> next_candidates;
+    next_candidates.reserve(candidates.size());
+
     for (const auto & tok : candidates) {
         if (*tok.code_points == 0) {
             // reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -12690,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
+    std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
+        grammar->stacks = tmp_new_stacks;
     }
     grammar->partial_utf8 = decoded.second;
     GGML_ASSERT(!grammar->stacks.empty());
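One design note on the new out-parameter form, based only on the call sites in this patch: llama_grammar_accept clears new_stacks before filling it, so the input and output containers must be distinct objects. Callers either reuse a scratch vector, as above, or copy the current stacks first, as the test changes at the bottom of this diff do. A hedged sketch of the latter pattern:

    // sketch, assuming the declarations from llama.h in this patch,
    // a valid `grammar`, and a code point `chr`
    auto prev_stacks = grammar->stacks;                                      // keep the input alive
    llama_grammar_accept(grammar->rules, prev_stacks, chr, grammar->stacks); // result written in place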
@@ -13323,9 +13413,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }
 
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     std::mutex mutex;
-    int counter = 0;
+    int64_t counter = 0;
     size_t new_size = 0;
     if (nthread < 2) {
         // single-thread

@@ -13333,11 +13423,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     }
     auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
             nrows, n_per_row, imatrix]() {
-        const int nrows_per_chunk = chunk_size / n_per_row;
+        const int64_t nrows_per_chunk = chunk_size / n_per_row;
         size_t local_size = 0;
         while (true) {
             std::unique_lock<std::mutex> lock(mutex);
-            int first_row = counter; counter += nrows_per_chunk;
+            int64_t first_row = counter; counter += nrows_per_chunk;
             if (first_row >= nrows) {
                 if (local_size > 0) {
                     new_size += local_size;

@@ -13345,7 +13435,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
                 break;
             }
             lock.unlock();
-            const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
             local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
         }
     };
@@ -13468,7 +13558,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
+        if (name.find("attn_v.weight") != std::string::npos ||
+            name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;

@@ -13478,7 +13569,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks
-    GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
+    //
+    //  - qs.n_attention_wv == 0                     for Mamba       models
+    //  - qs.n_attention_wv == model.hparams.n_layer for Transformer models
+    //
+    GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;

@@ -13534,6 +13629,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // quantize only 2D and 3D tensors (experts)
         quantize &= (ggml_n_dims(tensor) >= 2);
+
+        // do not quantize norm tensors
+        quantize &= name.find("_norm.weight") == std::string::npos;
+
         quantize &= params->quantize_output_tensor || name != "output.weight";
         quantize &= !params->only_copy;
 
@@ -13580,7 +13679,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_size = ggml_nbytes(tensor);
             LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
         } else {
-            const size_t nelements = ggml_nelements(tensor);
+            const int64_t nelements = ggml_nelements(tensor);
 
             const float * imatrix = nullptr;
             if (imatrix_data) {

@@ -13632,20 +13731,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
-            if (work.size() < nelements * 4) {
+            if (work.size() < (size_t)nelements * 4) {
                 work.resize(nelements * 4); // upper bound on size
             }
             new_data = work.data();
 
-            const int n_per_row = tensor->ne[0];
+            const int64_t n_per_row = tensor->ne[0];
-            const int nrows = tensor->ne[1];
+            const int64_t nrows = tensor->ne[1];
 
-            static const int min_chunk_size = 32 * 512;
+            static const int64_t min_chunk_size = 32 * 512;
-            const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
+            const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
 
-            const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
+            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
-            const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
-            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
 
             // quantize each expert separately since they have different importance matrices
             new_size = 0;
@@ -16063,6 +16162,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
     return model->vocab.special_eos_id;
 }
 
+llama_token llama_token_cls(const struct llama_model * model) {
+    return model->vocab.special_cls_id;
+}
+
+llama_token llama_token_sep(const struct llama_model * model) {
+    return model->vocab.special_sep_id;
+}
+
 llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }

@@ -16097,9 +16204,9 @@ int32_t llama_tokenize(
         int32_t text_len,
         llama_token * tokens,
         int32_t n_tokens_max,
-        bool add_bos,
+        bool add_special,
-        bool special) {
+        bool parse_special) {
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
 
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
llama.h

@@ -786,6 +786,8 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
+    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
     // Returns -1 if unknown, 1 for true or 0 for false.
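A minimal usage sketch of the two new accessors (the surrounding code is hypothetical; it mirrors the [CLS] ... [SEP] framing the WPM tokenizer applies in llama.cpp above):

    // assuming a loaded BERT-style model in `model`
    std::vector<llama_token> tokens;
    tokens.push_back(llama_token_cls(model)); // classification token, e.g. [CLS]
    // ... append the tokenized text here ...
    tokens.push_back(llama_token_sep(model)); // sentence separator, e.g. [SEP]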
@@ -808,16 +810,16 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
-    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
-    ///                Does not insert a leading space.
+    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
+    ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
         const struct llama_model * model,
         const char * text,
         int32_t text_len,
         llama_token * tokens,
         int32_t n_tokens_max,
-        bool add_bos,
+        bool add_special,
-        bool special);
+        bool parse_special);
 
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
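A usage sketch of the renamed parameters (the prompt and buffer size are hypothetical): add_special controls whether the vocab's own BOS/EOS/CLS/SEP framing is inserted, while parse_special controls whether special-token text inside the prompt is parsed rather than treated as plain text.

    // assuming a loaded `model`
    const std::string text = "Hello world";
    std::vector<llama_token> tokens(64);

    const int32_t n = llama_tokenize(
            model, text.c_str(), (int32_t) text.size(),
            tokens.data(), (int32_t) tokens.size(),
            /*add_special   =*/ true,
            /*parse_special =*/ false);

    if (n >= 0) {
        tokens.resize(n);  // number of tokens written
    } else {
        tokens.resize(-n); // buffer was too small; -n tokens would have been needed
        // ... call llama_tokenize again with the larger buffer ...
    }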
@@ -1095,10 +1097,11 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
     struct llama_context * ctx
 );
 
-std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+void llama_grammar_accept(
         const std::vector<std::vector<llama_grammar_element>> & rules,
         const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t chr);
+        const uint32_t chr,
+        std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
 
 std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     const std::string & src,
scripts/gen-authors.sh (new, executable)

@@ -0,0 +1,9 @@
+#!/bin/bash
+
+printf "# date: $(date)\n" > AUTHORS
+printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS
+
+git log --format='%an <%ae>' --reverse --date=short master | awk '!seen[$0]++' | sort >> AUTHORS
+
+# if necessary, update your name here. for example: jdoe -> John Doe
+sed -i '' 's/^jdoe/John Doe/g' AUTHORS
@@ -3,9 +3,9 @@
 # Shortcut for downloading HF models
 #
 # Usage:
-#   ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-#   ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-#   ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
 #
 
 # all logs go to stderr

@@ -14,7 +14,7 @@ function log {
 }
 
 function usage {
-    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [-h|--help]"
+    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [--outdir <dir> [-h|--help]"
     exit 1
 }
 

@@ -26,9 +26,9 @@ function has_cmd {
 }
 
 if has_cmd wget; then
-    cmd="wget -q --show-progress -c -O %s %s"
+    cmd="wget -q --show-progress -c -O %s/%s %s"
 elif has_cmd curl; then
-    cmd="curl -C - -f -o %s -L %s"
+    cmd="curl -C - -f --output-dir %s -o %s -L %s"
 else
     log "[E] curl or wget not found"
     exit 1

@@ -37,6 +37,7 @@ fi
 url=""
 repo=""
 file=""
+outdir="."
 
 # parse args
 while [[ $# -gt 0 ]]; do

@@ -53,6 +54,10 @@ while [[ $# -gt 0 ]]; do
             file="$2"
             shift 2
             ;;
+        --outdir)
+            outdir="$2"
+            shift 2
+            ;;
         -h|--help)
             usage
             ;;

@@ -94,10 +99,10 @@ basename=$(basename $url)
 log "[+] attempting to download $basename"
 
 if [ -n "$cmd" ]; then
-    cmd=$(printf "$cmd" "$basename" "$url")
+    cmd=$(printf "$cmd" "$outdir" "$basename" "$url")
     log "[+] $cmd"
     if $cmd; then
-        echo $basename
+        echo $outdir/$basename
         exit 0
     fi
 fi
@@ -66,6 +66,8 @@ while read c; do
         tests/test-quantize-fns.cpp \
         tests/test-quantize-perf.cpp \
         tests/test-backend-ops.cpp \
+        LICENSE \
+        scripts/gen-authors.sh \
         >> $SRC_LLAMA/ggml-src.patch
 done < $SRC_LLAMA/ggml-commits
 

@@ -123,6 +125,9 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # tests/test-quantize-fns.cpp  -> tests/test-quantize-fns.cpp
     # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
     # tests/test-backend-ops.cpp   -> tests/test-backend-ops.cpp
+    #
+    # LICENSE                -> LICENSE
+    # scripts/gen-authors.sh -> scripts/gen-authors.sh
 
     cat ggml-src.patch | sed \
         -e 's/src\/ggml\.c/ggml.c/g' \

@@ -156,6 +161,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \
         -e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \
         -e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \
+        -e 's/LICENSE/LICENSE/g' \
+        -e 's/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
         > ggml-src.patch.tmp
     mv ggml-src.patch.tmp ggml-src.patch
 
@@ -1 +1 @@
-bb8d8cff851b2de6fde4904be492d39458837e1a
+98875cdb7e9ceeb726d1c196d2fecb3cbb59b93a
@@ -31,3 +31,6 @@ cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
 cp -rpv ../ggml/tests/test-opt.cpp          ./tests/test-opt.cpp
 cp -rpv ../ggml/tests/test-grad0.cpp        ./tests/test-grad0.cpp
 cp -rpv ../ggml/tests/test-backend-ops.cpp  ./tests/test-backend-ops.cpp
+
+cp -rpv ../LICENSE                     ./LICENSE
+cp -rpv ../ggml/scripts/gen-authors.sh ./scripts/gen-authors.sh
@@ -38,7 +38,7 @@ number ::= [0-9]+)""";
 
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         auto prev_stacks = grammar->stacks;
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
         assert(!grammar->stacks.empty());
     }
 

@@ -138,7 +138,7 @@ ws ::= [ \t\n\r]?)""";
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         ++pos;
         auto prev_stacks = grammar->stacks;
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
 
         // Expect that each code point will not cause the grammar to fail
         if (grammar->stacks.empty()) {

@@ -173,7 +173,7 @@ ws ::= [ \t\n\r]?)""";
 
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         auto prev_stacks = grammar->stacks;
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
         if (grammar->stacks.empty()) {
             parse_failed = true;
             break;