Merge remote-tracking branch 'origin/master' into json-faster-repetitions2
This commit is contained in:
commit
958bdda559
55 changed files with 4509 additions and 3231 deletions
22
.github/workflows/build.yml
vendored
22
.github/workflows/build.yml
vendored
|
@ -52,7 +52,7 @@ jobs:
|
|||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
ctest -L 'main|curl' --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
|
@ -101,7 +101,9 @@ jobs:
|
|||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
|
@ -209,21 +211,21 @@ jobs:
|
|||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_FATAL_WARNINGS=ON
|
||||
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
ctest -L 'main|curl' --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
|
@ -938,6 +940,12 @@ jobs:
|
|||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
|
||||
- name: Move artifacts
|
||||
id: move_artifacts
|
||||
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
|
@ -956,7 +964,7 @@ jobs:
|
|||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||
for (let file of await fs.readdirSync('./artifact')) {
|
||||
for (let file of await fs.readdirSync('./artifact/release')) {
|
||||
if (path.extname(file) === '.zip') {
|
||||
console.log('uploadReleaseAsset', file);
|
||||
await github.repos.uploadReleaseAsset({
|
||||
|
@ -964,7 +972,7 @@ jobs:
|
|||
repo: context.repo.repo,
|
||||
release_id: release_id,
|
||||
name: file,
|
||||
data: await fs.readFileSync(`./artifact/${file}`)
|
||||
data: await fs.readFileSync(`./artifact/release/${file}`)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -48,6 +48,7 @@ models-mnt
|
|||
/convert-llama2c-to-ggml
|
||||
/embd-input-test
|
||||
/embedding
|
||||
/eval-callback
|
||||
/gguf
|
||||
/gguf-llama-simple
|
||||
/gguf-split
|
||||
|
|
655
AUTHORS
Normal file
655
AUTHORS
Normal file
|
@ -0,0 +1,655 @@
|
|||
# date: Tue Apr 9 09:17:14 EEST 2024
|
||||
# this file is auto-generated by scripts/gen-authors.sh
|
||||
|
||||
0cc4m <picard12@live.de>
|
||||
0xspringtime <110655352+0xspringtime@users.noreply.github.com>
|
||||
2f38b454 <dxf@protonmail.com>
|
||||
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
|
||||
44670 <44670@users.noreply.github.com>
|
||||
AN Long <aisk@users.noreply.github.com>
|
||||
AT <manyoso@users.noreply.github.com>
|
||||
Aarni Koskela <akx@iki.fi>
|
||||
Aaron Miller <apage43@ninjawhale.com>
|
||||
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
|
||||
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
|
||||
Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
|
||||
Adithya Balaji <adithya.b94@gmail.com>
|
||||
AdithyanI <adithyan.i4internet@gmail.com>
|
||||
Adrian <smith.adriane@gmail.com>
|
||||
Adrian Hesketh <a-h@users.noreply.github.com>
|
||||
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
|
||||
Aisuko <urakiny@gmail.com>
|
||||
Alberto <57916483+albbus-stack@users.noreply.github.com>
|
||||
Alex <awhill19@icloud.com>
|
||||
Alex Azarov <alex@azarov.by>
|
||||
Alex Azarov <alexander.azarov@mapbox.com>
|
||||
Alex Klinkhamer <from.github.com.917@grencez.dev>
|
||||
Alex Klinkhamer <git@grencez.dev>
|
||||
Alex Nguyen <tiendung@users.noreply.github.com>
|
||||
Alex Petenchea <alex.petenchea@gmail.com>
|
||||
Alex Renda <alexrenda@users.noreply.github.com>
|
||||
Alex von Gluck IV <kallisti5@unixzen.com>
|
||||
Alexey Parfenov <zxed@alkatrazstudio.net>
|
||||
Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
|
||||
Ali Nehzat <ali.nehzat@thanks.dev>
|
||||
Ali Tariq <ali.tariq@10xengineers.ai>
|
||||
Alon <alonfaraj@gmail.com>
|
||||
AlpinDale <52078762+AlpinDale@users.noreply.github.com>
|
||||
AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
|
||||
Ananta Bastola <anantarajbastola@gmail.com>
|
||||
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
|
||||
András Salamon <ott2@users.noreply.github.com>
|
||||
Andrei <abetlen@gmail.com>
|
||||
Andrew Canis <andrew.canis@gmail.com>
|
||||
Andrew Duffy <a10y@users.noreply.github.com>
|
||||
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
|
||||
Arik Poznanski <arikpoz@users.noreply.github.com>
|
||||
Artem <guinmoon@gmail.com>
|
||||
Artyom Lebedev <vagran.ast@gmail.com>
|
||||
Asbjørn Olling <asbjornolling@gmail.com>
|
||||
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
|
||||
Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
|
||||
Ashraful Islam <ashraful.meche@gmail.com>
|
||||
Atsushi Tatsuma <yoshoku@outlook.com>
|
||||
Austin <77757836+teleprint-me@users.noreply.github.com>
|
||||
AustinMroz <austinmroz@utexas.edu>
|
||||
BADR <contact@pythops.com>
|
||||
Bach Le <bach@bullno1.com>
|
||||
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
|
||||
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
|
||||
Behnam M <58621210+ibehnam@users.noreply.github.com>
|
||||
Ben Garney <bengarney@users.noreply.github.com>
|
||||
Ben Siraphob <bensiraphob@gmail.com>
|
||||
Ben Williams <ben@719ben.com>
|
||||
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
|
||||
Bernat Vadell <hounter.caza@gmail.com>
|
||||
Bodo Graumann <mail@bodograumann.de>
|
||||
Bono Lv <lvscar@users.noreply.github.com>
|
||||
Borislav Stanimirov <b.stanimirov@abv.bg>
|
||||
Branden Butler <bwtbutler@hotmail.com>
|
||||
Brian <mofosyne@gmail.com>
|
||||
Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||
CJ Pais <cj@cjpais.com>
|
||||
CRD716 <crd716@gmail.com>
|
||||
Cameron <csteele@steelecameron.com>
|
||||
Cameron Kaiser <classilla@users.noreply.github.com>
|
||||
Casey Primozic <casey@cprimozic.net>
|
||||
Casey Primozic <me@ameo.link>
|
||||
CausalLM <148736309+CausalLM@users.noreply.github.com>
|
||||
Cebtenzzre <cebtenzzre@gmail.com>
|
||||
Chad Brewbaker <crb002@gmail.com>
|
||||
Cheng Shao <terrorjack@type.dance>
|
||||
Chris Kuehl <ckuehl@ckuehl.me>
|
||||
Christian Demsar <christian@github.email.demsar.us>
|
||||
Christian Demsar <crasm@git.vczf.us>
|
||||
Christian Falch <875252+chrfalch@users.noreply.github.com>
|
||||
Christian Kögler <ck3d@gmx.de>
|
||||
Clark Saben <76020733+csaben@users.noreply.github.com>
|
||||
Clint Herron <hanclinto@gmail.com>
|
||||
Cuong Trinh Manh <nguoithichkhampha@gmail.com>
|
||||
DAN™ <dranger003@gmail.com>
|
||||
Damian Stewart <d@damianstewart.com>
|
||||
Dane Madsen <dane_madsen@hotmail.com>
|
||||
DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
|
||||
Daniel Bevenius <daniel.bevenius@gmail.com>
|
||||
Daniel Drake <drake@endlessos.org>
|
||||
Daniel Hiltgen <dhiltgen@users.noreply.github.com>
|
||||
Daniel Illescas Romero <illescas.daniel@protonmail.com>
|
||||
DannyDaemonic <DannyDaemonic@gmail.com>
|
||||
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
|
||||
Dave Della Costa <ddellacosta+github@gmail.com>
|
||||
David Friehs <david@friehs.info>
|
||||
David Kennedy <dakennedyd@gmail.com>
|
||||
David Pflug <david@pflug.email>
|
||||
David Renshaw <dwrenshaw@gmail.com>
|
||||
David Sommers <12738+databyte@users.noreply.github.com>
|
||||
David Yang <davidyang6us@gmail.com>
|
||||
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
|
||||
Dean <Dean.Sinaean@gmail.com>
|
||||
Deins <deinsegle@gmail.com>
|
||||
Didzis Gosko <didzis@users.noreply.github.com>
|
||||
Don Mahurin <dmahurin@users.noreply.github.com>
|
||||
DooWoong Lee (David) <manics99@naver.com>
|
||||
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
|
||||
Douglas Hanley <thesecretaryofwar@gmail.com>
|
||||
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
|
||||
Ebey Abraham <ebey97@gmail.com>
|
||||
Ed Lee <edilee@mozilla.com>
|
||||
Ed Lepedus <ed.lepedus@googlemail.com>
|
||||
Edward Taylor <edeetee@gmail.com>
|
||||
Elbios <141279586+Elbios@users.noreply.github.com>
|
||||
Engininja2 <139037756+Engininja2@users.noreply.github.com>
|
||||
Equim <sayaka@ekyu.moe>
|
||||
Eric Sommerlade <es0m@users.noreply.github.com>
|
||||
Eric Zhang <34133756+EZForever@users.noreply.github.com>
|
||||
Erik Garrison <erik.garrison@gmail.com>
|
||||
Erik Scholz <Green-Sky@users.noreply.github.com>
|
||||
Ettore Di Giacinto <mudler@users.noreply.github.com>
|
||||
Evan Jones <evan.q.jones@gmail.com>
|
||||
Evan Miller <emmiller@gmail.com>
|
||||
Eve <139727413+netrunnereve@users.noreply.github.com>
|
||||
Evgeny Kurnevsky <kurnevsky@gmail.com>
|
||||
Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
|
||||
ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
|
||||
FK <sozforex@gmail.com>
|
||||
Fabian <cmdrf@users.noreply.github.com>
|
||||
Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
|
||||
Faez Shakil <faez.shakil@gmail.com>
|
||||
FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
|
||||
Fattire <528174+fat-tire@users.noreply.github.com>
|
||||
Felix <stenbackfelix@gmail.com>
|
||||
Finn Voorhees <finnvoorhees@gmail.com>
|
||||
Firat <firatkiral@gmail.com>
|
||||
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
|
||||
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
|
||||
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
|
||||
FrankHB <frankhb1989@gmail.com>
|
||||
Frederik Vogel <Schaltfehler@users.noreply.github.com>
|
||||
Gabe Goodhart <gabe.l.hart@gmail.com>
|
||||
GainLee <perfecter.gen@gmail.com>
|
||||
Galunid <karolek1231456@gmail.com>
|
||||
Gary Linscott <glinscott@gmail.com>
|
||||
Gary Mulder <gjmulder@gmail.com>
|
||||
Genkagaku.GPT <hlhr202@163.com>
|
||||
Georgi Gerganov <ggerganov@gmail.com>
|
||||
Gilad S <giladgd@users.noreply.github.com>
|
||||
GiviMAD <GiviMAD@users.noreply.github.com>
|
||||
Govlzkoy <gotope@users.noreply.github.com>
|
||||
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
|
||||
Guillaume Wenzek <gwenzek@users.noreply.github.com>
|
||||
Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
|
||||
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
|
||||
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
|
||||
Haohui Mai <ricetons@gmail.com>
|
||||
Haoxiang Fei <tonyfettes@tonyfettes.com>
|
||||
Harald Fernengel <harald.fernengel@here.com>
|
||||
Hatsune Miku <129688334+at8u@users.noreply.github.com>
|
||||
Henk Poley <HenkPoley@gmail.com>
|
||||
Henri Vasserman <henv@hot.ee>
|
||||
Henrik Forstén <henrik.forsten@gmail.com>
|
||||
Herman Semenov <GermanAizek@yandex.ru>
|
||||
Hesen Peng <hesen.peng@gmail.com>
|
||||
Hoang Nguyen <hugo53@users.noreply.github.com>
|
||||
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
|
||||
Howard Su <howard0su@gmail.com>
|
||||
Hua Jiang <allenhjiang@outlook.com>
|
||||
Huawei Lin <huaweilin.cs@gmail.com>
|
||||
Ian Bull <irbull@eclipsesource.com>
|
||||
Ian Bull <irbull@gmail.com>
|
||||
Ian Scrivener <github@zilogy.asia>
|
||||
Ido S <ido.pluto@gmail.com>
|
||||
IgnacioFDM <ignaciofdm@gmail.com>
|
||||
Igor Okulist <okigan@gmail.com>
|
||||
Ikko Eltociear Ashimine <eltociear@gmail.com>
|
||||
Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
|
||||
Ionoclast Laboratories <brigham@ionoclast.com>
|
||||
Isaac McFadyen <isaac@imcf.me>
|
||||
IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
|
||||
Ivan Komarov <Ivan.Komarov@dfyz.info>
|
||||
Ivan Stepanov <ivanstepanovftw@gmail.com>
|
||||
JH23X <165871467+JH23X@users.noreply.github.com>
|
||||
Jack Mousseau <jmousseau@users.noreply.github.com>
|
||||
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
|
||||
Jag Chadha <jagtesh@gmail.com>
|
||||
Jakub N <jakubniemczyk97@gmail.com>
|
||||
James Reynolds <magnusviri@users.noreply.github.com>
|
||||
Jan Boon <jan.boon@kaetemi.be>
|
||||
Jan Boon <kaetemi@gmail.com>
|
||||
Jan Ploski <jpl@plosquare.com>
|
||||
Jannis Schönleber <joennlae@gmail.com>
|
||||
Jared Van Bortel <cebtenzzre@gmail.com>
|
||||
Jared Van Bortel <jared@nomic.ai>
|
||||
Jason McCartney <jmac@theroot.org>
|
||||
Jean-Christophe Hoelt <hoelt@fovea.cc>
|
||||
Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
|
||||
Jed Fox <git@jedfox.com>
|
||||
Jeffrey Quesnelle <emozilla@nousresearch.com>
|
||||
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
|
||||
Jhen-Jie Hong <iainst0409@gmail.com>
|
||||
Jiahao Li <liplus17@163.com>
|
||||
Jian Liao <jianliao@users.noreply.github.com>
|
||||
JidongZhang-THU <1119708529@qq.com>
|
||||
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
|
||||
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
|
||||
Johannes Gäßler <johannesg@5d6.de>
|
||||
Johannes Rudolph <johannes.rudolph@gmail.com>
|
||||
John <78893154+cmp-nct@users.noreply.github.com>
|
||||
John Balis <phobossystems@gmail.com>
|
||||
John Smith <67539080+kingsidelee@users.noreply.github.com>
|
||||
JohnnyB <jboero@users.noreply.github.com>
|
||||
Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
|
||||
Jorge A <161275481+jorgealias@users.noreply.github.com>
|
||||
Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
|
||||
Joseph Stahl <1269177+josephst@users.noreply.github.com>
|
||||
Joyce <joycebrum@google.com>
|
||||
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
|
||||
Judd <foldl@users.noreply.github.com>
|
||||
Julius Arkenberg <arki05@users.noreply.github.com>
|
||||
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
|
||||
Juraj Bednar <juraj@bednar.io>
|
||||
Justin Parker <jparkerweb@gmail.com>
|
||||
Justin Suess <justin.suess@westpoint.edu>
|
||||
Justine Tunney <jtunney@gmail.com>
|
||||
Juuso Alasuutari <juuso.alasuutari@gmail.com>
|
||||
KASR <karim.asrih@gmail.com>
|
||||
Kamil Tomšík <info@tomsik.cz>
|
||||
Karsten Weiss <knweiss@gmail.com>
|
||||
Karthick <j.karthic2004@gmail.com>
|
||||
Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
|
||||
Karthik Sethuraman <k.seth1993@gmail.com>
|
||||
Kasumi <90275229+kasumi-1@users.noreply.github.com>
|
||||
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
|
||||
Keiichi Tabata <keiichi.tabata@outlook.com>
|
||||
Kenvix ⭐ <kenvixzure@live.com>
|
||||
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
|
||||
Kevin Ji <1146876+kevinji@users.noreply.github.com>
|
||||
Kevin Kwok <antimatter15@gmail.com>
|
||||
Kevin Lo <kevlo@kevlo.org>
|
||||
Kolen Cheung <ickc@users.noreply.github.com>
|
||||
Konstantin Herud <konstantin.herud@denkbares.com>
|
||||
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
|
||||
Kunshang Ji <kunshang.ji@intel.com>
|
||||
Kyle Liang <liangmanlai@gmail.com>
|
||||
Kyle Mistele <kyle@mistele.com>
|
||||
Kylin <56434533+KyL0N@users.noreply.github.com>
|
||||
Lars Grammel <lars.grammel@gmail.com>
|
||||
Laura <Tijntje_7@msn.com>
|
||||
Lee <44310445+lx200916@users.noreply.github.com>
|
||||
Lee Drake <b.lee.drake@gmail.com>
|
||||
Leng Yue <lengyue@lengyue.me>
|
||||
LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
|
||||
Leonardo Neumann <leonardo@neumann.dev.br>
|
||||
Li Tan <tanliboy@gmail.com>
|
||||
Linwei Wang <wanix1988@gmail.com>
|
||||
LoganDark <github@logandark.mozmail.com>
|
||||
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
||||
Luciano <lucianostrika44@gmail.com>
|
||||
Luo Tian <lt@basecity.com>
|
||||
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
||||
Maarten ter Huurne <maarten@treewalker.org>
|
||||
Mack Straight <eiz@users.noreply.github.com>
|
||||
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
||||
MaggotHATE <clay1326@gmail.com>
|
||||
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
||||
Marco Matthies <71844+marcom@users.noreply.github.com>
|
||||
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
||||
Marian Cepok <marian.cepok@gmail.com>
|
||||
Mark Fairbairn <thebaron88@gmail.com>
|
||||
Marko Tasic <mtasic85@gmail.com>
|
||||
Martin Krasser <krasserm@googlemail.com>
|
||||
Martin Schwaighofer <mschwaig@users.noreply.github.com>
|
||||
Marvin Gießing <marvin.giessing@gmail.com>
|
||||
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
||||
Matheus C. França <matheus-catarino@hotmail.com>
|
||||
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
||||
Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
|
||||
Mathijs de Bruin <mathijs@mathijsfietst.nl>
|
||||
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
||||
Matt Pulver <matt.pulver@heavy.ai>
|
||||
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
||||
Matthew Tejo <matthew.tejo@gmail.com>
|
||||
Matvey Soloviev <blackhole89@gmail.com>
|
||||
Maxime <672982+maximegmd@users.noreply.github.com>
|
||||
Maximilian Winter <maximilian.winter.91@gmail.com>
|
||||
Meng Zhang <meng@tabbyml.com>
|
||||
Meng, Hengyu <hengyu.meng@intel.com>
|
||||
Merrick Christensen <merrick.christensen@gmail.com>
|
||||
Michael Coppola <m18coppola@gmail.com>
|
||||
Michael Hueschen <m@mhueschen.dev>
|
||||
Michael Kesper <mkesper@schokokeks.org>
|
||||
Michael Klimenko <mklimenko29@gmail.com>
|
||||
Michael Podvitskiy <podvitskiymichael@gmail.com>
|
||||
Michael Potter <NanoTekGuy@Gmail.com>
|
||||
Michaël de Vries <vriesdemichael@gmail.com>
|
||||
Mihai <mihai.chirculescu@yahoo.com>
|
||||
Mike <ytianhui2004@gmail.com>
|
||||
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
||||
Mirko185 <mirkosig@gmail.com>
|
||||
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
||||
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
||||
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
||||
Murilo Santana <mvrilo@gmail.com>
|
||||
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
||||
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
||||
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
||||
Nebula <infinitewormhole@gmail.com>
|
||||
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
||||
Neuman Vong <neuman.vong@gmail.com>
|
||||
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
||||
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
||||
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
||||
Nigel Bosch <pnigelb@gmail.com>
|
||||
Niklas Korz <niklas@niklaskorz.de>
|
||||
Nindaleth <Nindaleth@users.noreply.github.com>
|
||||
Oleksandr Nikitin <oleksandr@tvori.info>
|
||||
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
||||
Olivier Chafik <ochafik@users.noreply.github.com>
|
||||
Ondřej Čertík <ondrej@certik.us>
|
||||
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
||||
Paul Tsochantaris <ptsochantaris@icloud.com>
|
||||
Pavol Rusnak <pavol@rusnak.io>
|
||||
Pedro Cuenca <pedro@huggingface.co>
|
||||
Peter Sugihara <peter@campsh.com>
|
||||
Phil H <5756783+phiharri@users.noreply.github.com>
|
||||
Philip Taron <philip.taron@gmail.com>
|
||||
Phillip Kravtsov <phillip@kravtsov.net>
|
||||
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
|
||||
Pierrick Hymbert <pierrick.hymbert@gmail.com>
|
||||
Przemysław Pawełczyk <przemoc@gmail.com>
|
||||
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
|
||||
Qingyou Meng <meng.qingyou@gmail.com>
|
||||
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
|
||||
RJ Adriaansen <adriaansen@eshcc.eur.nl>
|
||||
Radoslav Gerganov <rgerganov@gmail.com>
|
||||
Radosław Gryta <radek.gryta@gmail.com>
|
||||
Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
|
||||
Rand Xie <randxiexyy29@gmail.com>
|
||||
Randall Fitzgerald <randall@dasaku.net>
|
||||
Reinforce-II <fate@eastal.com>
|
||||
Riceball LEE <snowyu.lee@gmail.com>
|
||||
Richard Kiss <him@richardkiss.com>
|
||||
Richard Roberson <richardr1126@gmail.com>
|
||||
Rick G <26732651+TheFlipbook@users.noreply.github.com>
|
||||
Rickard Edén <rickardeden@gmail.com>
|
||||
Rickard Hallerbäck <rickard.hallerback@gmail.com>
|
||||
Rickey Bowers Jr <bitRAKE@gmail.com>
|
||||
Riley Stewart <ristew@users.noreply.github.com>
|
||||
Rinne <AsakusaRinne@gmail.com>
|
||||
Rinne <liu_yaohui1998@126.com>
|
||||
Robert Brisita <986796+rbrisita@users.noreply.github.com>
|
||||
Robert Sung-wook Shin <edp1096@users.noreply.github.com>
|
||||
Robey Holderith <robey@flaminglunchbox.net>
|
||||
Robyn <robyngraf@users.noreply.github.com>
|
||||
Roger Meier <r.meier@siemens.com>
|
||||
Roland <14355895+rbur0425@users.noreply.github.com>
|
||||
Romain D <90720+Artefact2@users.noreply.github.com>
|
||||
Romain Neutron <romain@neutron.io>
|
||||
Roman Parykin <donderom@gmail.com>
|
||||
Ron Evans <ron@hybridgroup.com>
|
||||
Ron Jailall <rojailal@gmail.com>
|
||||
Ronny Brendel <ronnybrendel@gmail.com>
|
||||
Ronsor <ronsor@ronsor.pw>
|
||||
Rowan Hart <rowanbhart@gmail.com>
|
||||
Rune <43761327+Rune-AI@users.noreply.github.com>
|
||||
Ryan Landay <rlanday@gmail.com>
|
||||
Ryder Wishart <ryderwishart@gmail.com>
|
||||
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
||||
SakuraUmi <yukinon244@gmail.com>
|
||||
Salvador E. Tropea <stropea@inti.gob.ar>
|
||||
Sam Spilsbury <smspillaz@gmail.com>
|
||||
Sami Farin <3876865+Safari77@users.noreply.github.com>
|
||||
Samuel Maynard <samwmaynard@gmail.com>
|
||||
Sang-Kil Park <sang.park@42dot.ai>
|
||||
Seb C <47074056+Sebby37@users.noreply.github.com>
|
||||
Sebastián A <sebastian.aedo29@gmail.com>
|
||||
SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
||||
Senemu <10880819+Senemu@users.noreply.github.com>
|
||||
Sergey Alirzaev <zl29ah@gmail.com>
|
||||
Sergio López <slp@sinrega.org>
|
||||
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
||||
ShadovvBeast <ShadovvBeast@gmail.com>
|
||||
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
||||
Shangning Xu <32517059+xushangning@users.noreply.github.com>
|
||||
Shijie <821898965@qq.com>
|
||||
Shintarou Okada <kokuzen@gmail.com>
|
||||
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
||||
Shouzheng Liu <lshzh.hi@gmail.com>
|
||||
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
||||
Simon Willison <swillison@gmail.com>
|
||||
Siwen Yu <yusiwen@gmail.com>
|
||||
Sky Yan <skyan83@gmail.com>
|
||||
Slaren <2141330+slaren@users.noreply.github.com>
|
||||
Slava Primenko <primenko.s@gmail.com>
|
||||
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
|
||||
Someone <sergei.kozlukov@aalto.fi>
|
||||
Someone Serge <sergei.kozlukov@aalto.fi>
|
||||
Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
|
||||
Spencer Sutton <spencersutton@users.noreply.github.com>
|
||||
Srinivas Billa <nivibilla@gmail.com>
|
||||
Stefan Sydow <stefan@sydow.email>
|
||||
Stephan Walter <stephan@walter.name>
|
||||
Stephen Nichols <snichols@users.noreply.github.com>
|
||||
Steve Grubb <ausearch.1@gmail.com>
|
||||
Steven Roussey <sroussey@gmail.com>
|
||||
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
||||
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
||||
SuperUserNameMan <yoann@terminajones.com>
|
||||
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
|
||||
Taikono-Himazin <kazu@po.harenet.ne.jp>
|
||||
Tameem <113388789+AhmadTameem@users.noreply.github.com>
|
||||
Tamotsu Takahashi <ttakah+github@gmail.com>
|
||||
Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
|
||||
Thatcher Chamberlin <j.thatcher.c@gmail.com>
|
||||
Theia Vogel <theia@vgel.me>
|
||||
Thérence <13496987+Royalphax@users.noreply.github.com>
|
||||
Thibault Terrasson <thibault.terrasson@gmail.com>
|
||||
Thomas Klausner <wiz@gatalith.at>
|
||||
Tim Miller <drasticactions@users.noreply.github.com>
|
||||
Timmy Knight <r2d2fish@gmail.com>
|
||||
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
|
||||
Ting Lou <ting.lou@gmail.com>
|
||||
Ting Sun <suntcrick@gmail.com>
|
||||
Tobias Lütke <tobi@shopify.com>
|
||||
Tom C <tom.corelis@gmail.com>
|
||||
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
||||
Tomas <tom.tomas.36478119@gmail.com>
|
||||
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
||||
Tristan Ross <rosscomputerguy@protonmail.com>
|
||||
Tungsten842 <886724vf@anonaddy.me>
|
||||
Tungsten842 <quantmint@protonmail.com>
|
||||
Tushar <ditsuke@protonmail.com>
|
||||
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
||||
Uzo Nweke <uzoechi@gmail.com>
|
||||
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
||||
Val Kharitonov <mail@kharvd.com>
|
||||
Valentin Konovalov <valle.ketsujin@gmail.com>
|
||||
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
||||
Victor Z. Peng <ziliangdotme@gmail.com>
|
||||
Vlad <spitfireage@gmail.com>
|
||||
Vladimir <bogdad@gmail.com>
|
||||
Vladimir Malyutin <first-leon@yandex.ru>
|
||||
Vladimir Zorin <vladimir@deviant.guru>
|
||||
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
|
||||
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
|
||||
Weird Constructor <weirdconstructor@gmail.com>
|
||||
Welby Seely <welbyseely@gmail.com>
|
||||
Wentai Zhang <rchardx@gmail.com>
|
||||
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
||||
Willy Tarreau <w@1wt.eu>
|
||||
Wu Jian Ping <wujjpp@hotmail.com>
|
||||
Wu Jian Ping <wujp@greatld.com>
|
||||
Xiake Sun <xiake.sun@intel.com>
|
||||
Xiang (Kevin) Li <kevinli020508@gmail.com>
|
||||
Xiao-Yong Jin <jinxiaoyong@gmail.com>
|
||||
XiaotaoChen <chenxiaotao1234@gmail.com>
|
||||
Xiaoyi Chen <cxychina@gmail.com>
|
||||
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
||||
Xuan Son Nguyen <thichthat@gmail.com>
|
||||
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
||||
Yiming Cui <conandiy@vip.qq.com>
|
||||
Yishuo Wang <MeouSker77@outlook.com>
|
||||
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
||||
Yui <dev@sleepyyui.com>
|
||||
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
|
||||
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
|
||||
ZHAOKAI WANG <sanxianwei@163.com>
|
||||
Zane Shannon <z@zcs.me>
|
||||
Zay <95888118+isaiahbjork@users.noreply.github.com>
|
||||
Zenix <zenixls2@gmail.com>
|
||||
Zhang Peiyuan <a1286225768@gmail.com>
|
||||
ZhouYuChen <zhouyuchen@naver.com>
|
||||
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
||||
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
||||
Zsapi <martin1.zsapka@gmail.com>
|
||||
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
|
||||
adel boussaken <netdur@gmail.com>
|
||||
afrideva <95653597+afrideva@users.noreply.github.com>
|
||||
akawrykow <142945436+akawrykow@users.noreply.github.com>
|
||||
alexpinel <93524949+alexpinel@users.noreply.github.com>
|
||||
alonfaraj <alonfaraj@gmail.com>
|
||||
andrijdavid <david@geek.mg>
|
||||
anon998 <131767832+anon998@users.noreply.github.com>
|
||||
anzz1 <anzz1@live.com>
|
||||
apaz <aarpazdera@gmail.com>
|
||||
apcameron <37645737+apcameron@users.noreply.github.com>
|
||||
arcrank <arcrank@gmail.com>
|
||||
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
||||
at8u <129688334+at8u@users.noreply.github.com>
|
||||
automaticcat <daogiatuank54@gmail.com>
|
||||
bandoti <141645996+bandoti@users.noreply.github.com>
|
||||
beiller <beiller@gmail.com>
|
||||
bhubbb <79117352+bhubbb@users.noreply.github.com>
|
||||
bmwl <brian.marshall@tolko.com>
|
||||
bobqianic <129547291+bobqianic@users.noreply.github.com>
|
||||
bryanSwk <93190252+bryanSwk@users.noreply.github.com>
|
||||
bsilvereagle <bsilvereagle@users.noreply.github.com>
|
||||
bssrdf <merlintiger@hotmail.com>
|
||||
byte-6174 <88070277+byte-6174@users.noreply.github.com>
|
||||
cebtenzzre <cebtenzzre@gmail.com>
|
||||
chaihahaha <chai836275709@gmail.com>
|
||||
chiranko <96988916+chiranko@users.noreply.github.com>
|
||||
clibdev <52199778+clibdev@users.noreply.github.com>
|
||||
clyang <clyang@clyang.net>
|
||||
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
|
||||
coezbek <c.oezbek@gmail.com>
|
||||
comex <comexk@gmail.com>
|
||||
compilade <113953597+compilade@users.noreply.github.com>
|
||||
crasm <crasm@git.vczf.net>
|
||||
crasm <crasm@git.vczf.us>
|
||||
daboe01 <daboe01@googlemail.com>
|
||||
david raistrick <keen99@users.noreply.github.com>
|
||||
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
||||
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
||||
divinity76 <divinity76@gmail.com>
|
||||
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
||||
drbh <david.richard.holtz@gmail.com>
|
||||
ds5t5 <145942675+ds5t5@users.noreply.github.com>
|
||||
dylan <canardleteer@users.noreply.github.com>
|
||||
eastriver <lee@eastriver.dev>
|
||||
ebraminio <ebraminio@gmail.com>
|
||||
eiery <19350831+eiery@users.noreply.github.com>
|
||||
eric8607242 <e0928021388@gmail.com>
|
||||
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
||||
gliptic <gliptic@users.noreply.github.com>
|
||||
goerch <jhr.walter@t-online.de>
|
||||
grahameth <96447521+grahameth@users.noreply.github.com>
|
||||
gwjr <502526+gwjr@users.noreply.github.com>
|
||||
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
||||
hankcs <cnhankmc@gmail.com>
|
||||
hoangmit <hoangmit@users.noreply.github.com>
|
||||
hongbo.mo <352280764@qq.com>
|
||||
howlger <eclipse@voormann.de>
|
||||
howlger <github@voormann.de>
|
||||
hutli <6594598+hutli@users.noreply.github.com>
|
||||
hutli <hutli@hutli.hu>
|
||||
hutli <jensstaermose@hotmail.com>
|
||||
hxer7963 <hxer7963@gmail.com>
|
||||
hydai <z54981220@gmail.com>
|
||||
iSma <ismail.senhaji@gmail.com>
|
||||
iacore <74560659+iacore@users.noreply.github.com>
|
||||
igarnier <igarnier@protonmail.com>
|
||||
iohub <rickyang.pro@gmail.com>
|
||||
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||
jameswu2014 <545426914@qq.com>
|
||||
jneem <joeneeman@gmail.com>
|
||||
johnson442 <56517414+johnson442@users.noreply.github.com>
|
||||
jon-chuang <9093549+jon-chuang@users.noreply.github.com>
|
||||
jp-x-g <jpxg-dev@protonmail.com>
|
||||
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
||||
kaizau <kaizau@users.noreply.github.com>
|
||||
kalomaze <66376113+kalomaze@users.noreply.github.com>
|
||||
kang <tpdns9032100@gmail.com>
|
||||
katsu560 <118887472+katsu560@users.noreply.github.com>
|
||||
kchro3 <62481661+kchro3@users.noreply.github.com>
|
||||
khimaros <me@khimaros.com>
|
||||
kiltyj <kiltyj@gmail.com>
|
||||
klosax <131523366+klosax@users.noreply.github.com>
|
||||
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
||||
kunnis <kunnis@users.noreply.github.com>
|
||||
kuronekosaiko <EvanChanJ@163.com>
|
||||
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
||||
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
||||
l3utterfly <gc.pthzfoldr@gmail.com>
|
||||
ldwang <ftgreat@163.com>
|
||||
le.chang <cljs118@126.com>
|
||||
leejet <leejet714@gmail.com>
|
||||
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
||||
lon <114724657+longregen@users.noreply.github.com>
|
||||
m3ndax <adrian.goessl@outlook.com>
|
||||
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||
makomk <makosoft@googlemail.com>
|
||||
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||
mdrokz <mohammadmunshi@gmail.com>
|
||||
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||
minarchist <minarchist@users.noreply.github.com>
|
||||
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
||||
mmyjona <jonathan.gonse@gmail.com>
|
||||
momonga <115213907+mmnga@users.noreply.github.com>
|
||||
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
||||
mzcu <milos.cubrilo@gmail.com>
|
||||
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
||||
ngc92 <7938269+ngc92@users.noreply.github.com>
|
||||
nhamanasu <45545786+nhamanasu@users.noreply.github.com>
|
||||
niansa/tuxifan <anton-sa@web.de>
|
||||
niansa/tuxifan <tuxifan@posteo.de>
|
||||
ningshanwutuobang <ningshanwutuobang@gmail.com>
|
||||
nold <Nold360@users.noreply.github.com>
|
||||
nopperl <54780682+nopperl@users.noreply.github.com>
|
||||
nusu-github <29514220+nusu-github@users.noreply.github.com>
|
||||
olexiyb <olexiyb@gmail.com>
|
||||
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||
opparco <parco.opaai@gmail.com>
|
||||
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||
perserk <perserk@gmail.com>
|
||||
postmasters <namnguyen@google.com>
|
||||
pudepiedj <pudepiedj@gmail.com>
|
||||
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
|
||||
qouoq <qouoq@fastmail.com>
|
||||
qunash <anzoria@gmail.com>
|
||||
rabidcopy <rabidcopy@yahoo.com>
|
||||
rankaiyx <rankaiyx@rankaiyx.com>
|
||||
rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
|
||||
rhuddleston <ryan.huddleston@percona.com>
|
||||
rimoliga <53384203+rimoliga@users.noreply.github.com>
|
||||
runfuture <runfuture@users.noreply.github.com>
|
||||
sandyiscool <sandyiscool@gmail.com>
|
||||
semidark <me@semidark.net>
|
||||
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||
shibe2 <shibe@tuta.io>
|
||||
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||
sjinzh <sjinzh@gmail.com>
|
||||
slaren <2141330+slaren@users.noreply.github.com>
|
||||
slaren <slarengh@gmail.com>
|
||||
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||
staviq <staviq@gmail.com>
|
||||
stduhpf <stephduh@live.fr>
|
||||
swittk <switt1995@gmail.com>
|
||||
takov751 <40316768+takov751@users.noreply.github.com>
|
||||
tarcey <cey.tarik@gmail.com>
|
||||
texmex76 <40733439+texmex76@users.noreply.github.com>
|
||||
thement <40525767+thement@users.noreply.github.com>
|
||||
tjohnman <tjohnman@users.noreply.github.com>
|
||||
tslmy <tslmy@users.noreply.github.com>
|
||||
ubik2 <ubik2@users.noreply.github.com>
|
||||
uint256_t <konndennsa@gmail.com>
|
||||
uint256_t <maekawatoshiki1017@gmail.com>
|
||||
unbounded <haakon@likedan.net>
|
||||
valiray <133289098+valiray@users.noreply.github.com>
|
||||
vodkaslime <646329483@qq.com>
|
||||
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
||||
whoreson <139810751+whoreson@users.noreply.github.com>
|
||||
wonjun Jang <strutive07@gmail.com>
|
||||
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||
xaedes <xaedes@gmail.com>
|
||||
xaedes <xaedes@googlemail.com>
|
||||
xloem <0xloem@gmail.com>
|
||||
yangli2 <yangli2@gmail.com>
|
||||
yuiseki <yuiseki@gmail.com>
|
||||
zakkor <edward.partenie@gmail.com>
|
||||
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||
zrm <trustiosity.zrm@gmail.com>
|
||||
源文雨 <41315874+fumiama@users.noreply.github.com>
|
||||
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
2
LICENSE
2
LICENSE
|
@ -1,6 +1,6 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2023 Georgi Gerganov
|
||||
Copyright (c) 2023-2024 The ggml authors
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
|
8
Makefile
8
Makefile
|
@ -1,7 +1,7 @@
|
|||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
|
||||
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
|
@ -646,7 +646,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
|
|||
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
||||
ifndef CUDA_DOCKER_ARCH
|
||||
ifndef CUDA_POWER_ARCH
|
||||
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
|
||||
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
|
||||
endif # CUDA_POWER_ARCH
|
||||
endif # CUDA_DOCKER_ARCH
|
||||
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
||||
|
@ -800,6 +800,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
|
|
@ -122,6 +122,8 @@ Typically finetunes of the base models below are supported as well.
|
|||
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||
|
||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
|
||||
|
||||
**Multimodal models:**
|
||||
|
||||
- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
|
||||
|
@ -185,7 +187,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
|||
- [Dot](https://github.com/alexpinel/Dot) (GPL)
|
||||
- [MindMac](https://mindmac.app) (proprietary)
|
||||
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
|
||||
|
||||
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
|
||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||
|
||||
---
|
||||
|
@ -496,7 +498,7 @@ Building the program with BLAS support may lead to some performance improvements
|
|||
|
||||
This provides BLAS acceleration on HIP-supported AMD GPUs.
|
||||
Make sure to have ROCm installed.
|
||||
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
|
||||
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
|
||||
|
||||
- Using `make`:
|
||||
```bash
|
||||
|
@ -513,7 +515,7 @@ Building the program with BLAS support may lead to some performance improvements
|
|||
|
||||
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
||||
```bash
|
||||
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
|
||||
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
||||
```
|
||||
|
||||
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
||||
|
|
|
@ -1745,6 +1745,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|||
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
||||
cparams.pooling_type = params.pooling_type;
|
||||
cparams.defrag_thold = params.defrag_thold;
|
||||
cparams.cb_eval = params.cb_eval;
|
||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||
cparams.offload_kqv = !params.no_kv_offload;
|
||||
|
||||
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
||||
|
@ -2192,7 +2194,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|||
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||
}
|
||||
|
||||
{
|
||||
if (params.warmup) {
|
||||
LOG("warming up the model with an empty run\n");
|
||||
|
||||
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
|
||||
|
@ -2212,23 +2214,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|||
std::vector<llama_token> llama_tokenize(
|
||||
const struct llama_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_bos,
|
||||
bool special) {
|
||||
return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
|
||||
}
|
||||
|
||||
std::vector<llama_token> llama_tokenize(
|
||||
const struct llama_model * model,
|
||||
const std::string & text,
|
||||
bool add_bos,
|
||||
bool special) {
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + add_bos;
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
|
||||
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
|
||||
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
|
|
|
@ -80,6 +80,9 @@ struct gpt_params {
|
|||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||
void * cb_eval_user_data = nullptr;
|
||||
|
||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
|
||||
llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
|
@ -156,6 +159,7 @@ struct gpt_params {
|
|||
bool infill = false; // use infill mode
|
||||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
bool warmup = true; // warmup run
|
||||
|
||||
std::string cache_type_k = "f16"; // KV cache data type for the K
|
||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||
|
@ -223,14 +227,14 @@ void llama_batch_add(
|
|||
std::vector<llama_token> llama_tokenize(
|
||||
const struct llama_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_bos,
|
||||
bool special = false);
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
std::vector<llama_token> llama_tokenize(
|
||||
const struct llama_model * model,
|
||||
const std::string & text,
|
||||
bool add_bos,
|
||||
bool special = false);
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
// tokenizes a token into a piece
|
||||
// should work similar to Python's `tokenizer.id_to_piece`
|
||||
|
|
|
@ -160,7 +160,7 @@ class Model(ABC):
|
|||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
|
@ -227,15 +227,14 @@ class Model(ABC):
|
|||
return ("pytorch_model.bin",)
|
||||
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
|
||||
|
||||
def _set_vocab_gpt2(self):
|
||||
dir_model = self.dir_model
|
||||
hparams = self.hparams
|
||||
# used for GPT-2 BPE and WordPiece vocabs
|
||||
def get_basic_vocab(self) -> tuple[list[str], list[int]]:
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
|
||||
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
||||
|
@ -255,11 +254,15 @@ class Model(ABC):
|
|||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
return tokens, toktypes
|
||||
|
||||
def _set_vocab_gpt2(self) -> None:
|
||||
tokens, toktypes = self.get_basic_vocab()
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_qwen(self):
|
||||
|
@ -2043,34 +2046,25 @@ class BertModel(Model):
|
|||
self.gguf_writer.add_pooling_type(pooling_type)
|
||||
|
||||
def set_vocab(self):
|
||||
# use huggingface vocab to get all tokens
|
||||
vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
|
||||
tokens, scores, toktypes = zip(*vocab.all_tokens())
|
||||
assert len(tokens) == vocab.vocab_size
|
||||
self.vocab_size = vocab.vocab_size
|
||||
tokens, toktypes = self.get_basic_vocab()
|
||||
self.vocab_size = len(tokens)
|
||||
|
||||
# we need this to validate the size of the token_type embeddings
|
||||
# though currently we are passing all zeros to the token_type embeddings
|
||||
n_token_types = len(set(toktypes))
|
||||
self.gguf_writer.add_token_type_count(n_token_types)
|
||||
self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
|
||||
|
||||
# convert to phantom space vocab
|
||||
def phantom(tok, typ):
|
||||
if tok.startswith(b"[") and tok.endswith(b"]"):
|
||||
def phantom(tok):
|
||||
if tok.startswith("[") and tok.endswith("]"):
|
||||
return tok
|
||||
if tok.startswith(b"##"):
|
||||
if tok.startswith("##"):
|
||||
return tok[2:]
|
||||
return b"\xe2\x96\x81" + tok
|
||||
tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
|
||||
|
||||
# set up bos and eos tokens (cls and sep)
|
||||
self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
|
||||
self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
|
||||
return "\u2581" + tok
|
||||
tokens = list(map(phantom, tokens))
|
||||
|
||||
# add vocab to gguf
|
||||
self.gguf_writer.add_tokenizer_model("bert")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
# handle special tokens
|
||||
|
@ -2142,16 +2136,6 @@ class NomicBertModel(BertModel):
|
|||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
||||
|
||||
def get_tensors(self):
|
||||
assert self.vocab_size is not None
|
||||
for name, data in super().get_tensors():
|
||||
# Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
|
||||
if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
|
||||
rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
|
||||
assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
|
||||
data = data[:self.vocab_size, :]
|
||||
yield name, data
|
||||
|
||||
|
||||
@Model.register("GemmaForCausalLM")
|
||||
class GemmaModel(Model):
|
||||
|
@ -2327,7 +2311,8 @@ class MambaModel(Model):
|
|||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert big float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
|
||||
new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
|
||||
if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
|
26
convert.py
26
convert.py
|
@ -33,7 +33,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
|||
import gguf
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import TypeAlias
|
||||
from typing_extensions import Self, TypeAlias
|
||||
|
||||
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
||||
faulthandler.register(signal.SIGUSR1)
|
||||
|
@ -139,7 +139,8 @@ class GGMLFileType(enum.IntEnum):
|
|||
dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
|
||||
if dt is None:
|
||||
raise ValueError(self)
|
||||
# 1D tensors are always F32.
|
||||
# Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
|
||||
# Also The 1d tensors aren't much of a performance/size issue. So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
|
||||
return dt if len(tensor.shape) > 1 else DT_F32
|
||||
|
||||
|
||||
|
@ -516,7 +517,7 @@ class LlamaHfVocab(Vocab):
|
|||
tokenizer_model = "llama"
|
||||
name = "hfft"
|
||||
|
||||
def __init__(self, base_path: Path, ignore_nonllama: bool = False):
|
||||
def __init__(self, base_path: Path):
|
||||
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
|
||||
# if this fails, FileNotFoundError propagates to caller
|
||||
with open(fname_tokenizer, encoding='utf-8') as f:
|
||||
|
@ -524,9 +525,7 @@ class LlamaHfVocab(Vocab):
|
|||
|
||||
# pre-check so we know if we need transformers
|
||||
tokenizer_model: dict[str, Any] = tokenizer_json['model']
|
||||
if ignore_nonllama:
|
||||
pass # workaround incorrect use of this class for WordPiece
|
||||
elif (
|
||||
if (
|
||||
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
|
||||
or tokenizer_json['decoder']['type'] != 'Sequence'
|
||||
):
|
||||
|
@ -646,16 +645,17 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
|
|||
|
||||
|
||||
class Tensor(ABC):
|
||||
ndarray: NDArray
|
||||
data_type: DataType
|
||||
|
||||
@abstractmethod
|
||||
def astype(self, data_type: DataType) -> Tensor: ...
|
||||
def astype(self, data_type: DataType) -> Self: ...
|
||||
@abstractmethod
|
||||
def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
|
||||
def permute(self, n_head: int, n_head_kv: int) -> Self: ...
|
||||
@abstractmethod
|
||||
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
|
||||
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
|
||||
@abstractmethod
|
||||
def part(self, n_part: int) -> UnquantizedTensor: ...
|
||||
def part(self, n_part: int) -> Self: ...
|
||||
@abstractmethod
|
||||
def to_ggml(self) -> GGMLCompatibleTensor: ...
|
||||
|
||||
|
@ -672,13 +672,13 @@ class UnquantizedTensor(Tensor):
|
|||
self.ndarray = ndarray
|
||||
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
||||
|
||||
def astype(self, data_type: DataType) -> Tensor:
|
||||
def astype(self, data_type: DataType) -> UnquantizedTensor:
|
||||
dtype = data_type.dtype
|
||||
if self.data_type == DT_BF16:
|
||||
self.ndarray = bf16_to_fp32(self.ndarray)
|
||||
return UnquantizedTensor(self.ndarray.astype(dtype))
|
||||
|
||||
def to_ggml(self) -> UnquantizedTensor:
|
||||
def to_ggml(self) -> Self:
|
||||
return self
|
||||
|
||||
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
|
||||
|
@ -1350,7 +1350,7 @@ def load_some_model(path: Path) -> ModelPlus:
|
|||
# Be extra-friendly and accept either a file or a directory:
|
||||
if path.is_dir():
|
||||
# Check if it's a set of safetensors files first
|
||||
globs = ["model-00001-of-*.safetensors", "model.safetensors"]
|
||||
globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
|
||||
files = [file for glob in globs for file in path.glob(glob)]
|
||||
if not files:
|
||||
# Try the PyTorch patterns too, with lower priority
|
||||
|
|
119
docs/HOWTO-add-model.md
Normal file
119
docs/HOWTO-add-model.md
Normal file
|
@ -0,0 +1,119 @@
|
|||
## Add a new model architecture to `llama.cpp`
|
||||
|
||||
Adding a model requires few steps:
|
||||
|
||||
1. Convert the model to GGUF
|
||||
2. Define the model architecture in `llama.cpp`
|
||||
3. Build the GGML graph implementation
|
||||
|
||||
After following these steps, you can open PR.
|
||||
|
||||
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
|
||||
- [main](../examples/main)
|
||||
- [imatrix](../examples/imatrix)
|
||||
- [quantize](../examples/quantize)
|
||||
- [server](../examples/server)
|
||||
|
||||
### 1. Convert the model to GGUF
|
||||
|
||||
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
|
||||
Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
|
||||
|
||||
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
|
||||
|
||||
The required steps to implement for an HF model are:
|
||||
|
||||
1. Define the model `Model.register` annotation in a new `Model` subclass, example:
|
||||
|
||||
```python
|
||||
@Model.register("MyModelForCausalLM")
|
||||
class MyModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.GROK
|
||||
```
|
||||
|
||||
2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
|
||||
|
||||
Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
|
||||
|
||||
Example for `falcon` model:
|
||||
```python
|
||||
MODEL_ARCH.FALCON: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM_2,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
]
|
||||
```
|
||||
|
||||
3. Map the original tensor names to the standardize equivalent in GGUF
|
||||
|
||||
As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
|
||||
|
||||
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
|
||||
|
||||
If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.
|
||||
|
||||
Example for the normalization tensor in attention layers:
|
||||
|
||||
```python
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
# Attention norm
|
||||
MODEL_TENSOR.ATTN_NORM: (
|
||||
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen
|
||||
"transformer.blocks.{bid}.norm_1", # mpt
|
||||
...
|
||||
)
|
||||
}
|
||||
```
|
||||
|
||||
`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
|
||||
|
||||
Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
|
||||
- `Model#set_gguf_parameters`
|
||||
- `Model#set_vocab`
|
||||
- `Model#write_tensors`
|
||||
|
||||
NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights.
|
||||
|
||||
### 2. Define the model architecture in `llama.cpp`
|
||||
|
||||
The model params and tensors layout must be defined in `llama.cpp`:
|
||||
1. Define a new `llm_arch`
|
||||
2. Define the tensors layout in `LLM_TENSOR_NAMES`
|
||||
3. Add any non standard metadata in `llm_load_hparams`
|
||||
4. Create the tensors for inference in `llm_load_tensors`
|
||||
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
|
||||
|
||||
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
|
||||
|
||||
### 3. Build the GGML graph implementation
|
||||
|
||||
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
|
||||
|
||||
Have a look to existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
|
||||
|
||||
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.
|
||||
|
||||
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
|
||||
|
||||
## GGUF specification
|
||||
|
||||
https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
|
||||
|
||||
## Resources
|
||||
|
||||
- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
|
||||
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
|
||||
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
|
||||
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
|
||||
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
|
||||
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
|
||||
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
|
||||
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
|
||||
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
|
|
@ -19,6 +19,7 @@ else()
|
|||
add_subdirectory(benchmark)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
add_subdirectory(finetune)
|
||||
add_subdirectory(gritlm)
|
||||
add_subdirectory(gguf-split)
|
||||
|
|
|
@ -123,10 +123,10 @@ int main(int argc, char ** argv) {
|
|||
inputs.push_back(inp);
|
||||
}
|
||||
|
||||
// add eos if not present
|
||||
// add SEP if not present
|
||||
for (auto & inp : inputs) {
|
||||
if (inp.empty() || inp.back() != llama_token_eos(model)) {
|
||||
inp.push_back(llama_token_eos(model));
|
||||
if (inp.empty() || inp.back() != llama_token_sep(model)) {
|
||||
inp.push_back(llama_token_sep(model));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
9
examples/eval-callback/CMakeLists.txt
Normal file
9
examples/eval-callback/CMakeLists.txt
Normal file
|
@ -0,0 +1,9 @@
|
|||
set(TARGET eval-callback)
|
||||
add_executable(${TARGET} eval-callback.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
set(TEST_TARGET test-eval-callback)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
95
examples/eval-callback/README.md
Normal file
95
examples/eval-callback/README.md
Normal file
|
@ -0,0 +1,95 @@
|
|||
# llama.cpp/examples/eval-callback
|
||||
|
||||
A simple example which demonstrates how to use callback during the inference.
|
||||
It simply prints to the console all operations and tensor data.
|
||||
|
||||
Usage:
|
||||
|
||||
```shell
|
||||
eval-callback \
|
||||
--hf-repo ggml-org/models \
|
||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||
--model phi-2-q4_0.gguf \
|
||||
--prompt hello \
|
||||
--seed 42 \
|
||||
-ngl 33
|
||||
```
|
||||
|
||||
Will print:
|
||||
|
||||
```shell
|
||||
llm_load_tensors: offloaded 33/33 layers to GPU
|
||||
...
|
||||
llama_new_context_with_model: n_ctx = 512
|
||||
...
|
||||
llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
|
||||
llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
|
||||
llama_new_context_with_model: graph nodes = 1225
|
||||
llama_new_context_with_model: graph splits = 2
|
||||
ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -0.0181, 0.0272, 0.0272, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -0.6989, 1.0636, 1.0636, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -0.1800, 0.2817, 0.2632, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -0.1863, 0.2970, 0.2604, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1238, 1.2876, -1.8086, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1135, 1.4604, -1.9226, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1135, 1.4604, -1.9226, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1135, 1.4604, -1.9226, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1135, 1.4604, -1.9226, ...],
|
||||
[ -0.3608, 0.5076, -1.8866, ...],
|
||||
[ 1.7643, 0.0273, -2.1065, ...],
|
||||
...
|
||||
],
|
||||
]
|
||||
ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1135, 1.4604, -1.9226, ...],
|
||||
[ -0.3608, 0.5076, -1.8866, ...],
|
||||
[ 1.7643, 0.0273, -2.1065, ...],
|
||||
...
|
||||
],
|
||||
]
|
||||
```
|
185
examples/eval-callback/eval-callback.cpp
Normal file
185
examples/eval-callback/eval-callback.cpp
Normal file
|
@ -0,0 +1,185 @@
|
|||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* This the arbitrary data which will be passed to each callback.
|
||||
* Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
|
||||
*/
|
||||
struct callback_data {
|
||||
std::vector<uint8_t> data;
|
||||
};
|
||||
|
||||
static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||
std::string str;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
str += std::to_string(t->ne[i]);
|
||||
if (i + 1 < GGML_MAX_DIMS) {
|
||||
str += ", ";
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||
float sum = 0;
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
printf(" [\n");
|
||||
for (int64_t i2 = 0; i2 < ne[2] && i2 < n; i2++) {
|
||||
printf(" [\n");
|
||||
for (int64_t i1 = 0; i1 < ne[1] && i1 < n; i1++) {
|
||||
printf(" [");
|
||||
for (int64_t i0 = 0; i0 < ne[0] && i0 < n; i0++) {
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
float v;
|
||||
if (type == GGML_TYPE_F16) {
|
||||
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
|
||||
} else if (type == GGML_TYPE_F32) {
|
||||
v = *(float *) data + i;
|
||||
} else if (type == GGML_TYPE_I32) {
|
||||
v = (float) *(int32_t *) data + i;
|
||||
} else if (type == GGML_TYPE_I16) {
|
||||
v = (float) *(int16_t *) data + i;
|
||||
} else if (type == GGML_TYPE_I8) {
|
||||
v = (float) *(int8_t *) data + i;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
printf("%8.4f", v);
|
||||
sum += v;
|
||||
if (i0 < ne[0] - 1 && i0 < n - 1) printf(", ");
|
||||
}
|
||||
if (ne[0] > n) printf(", ...");
|
||||
printf("],\n");
|
||||
}
|
||||
if (ne[1] > n) printf(" ...\n");
|
||||
printf(" ],\n");
|
||||
}
|
||||
if (ne[2] > n) printf(" ...\n");
|
||||
printf(" ]\n");
|
||||
printf(" sum = %f\n", sum);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* GGML operations callback during the graph execution.
|
||||
*
|
||||
* @param t current tensor
|
||||
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
|
||||
* see ggml_backend_sched_eval_callback
|
||||
* @param user_data user data to pass at each call back
|
||||
* @return true to receive data or continue the graph, false otherwise
|
||||
*/
|
||||
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
auto * cb_data = (callback_data *) user_data;
|
||||
|
||||
const struct ggml_tensor * src0 = t->src[0];
|
||||
const struct ggml_tensor * src1 = t->src[1];
|
||||
|
||||
if (ask) {
|
||||
return true; // Always retrieve data
|
||||
}
|
||||
|
||||
char src1_str[128] = {0};
|
||||
if (src1) {
|
||||
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||
}
|
||||
|
||||
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||
t->name, ggml_type_name(t->type), ggml_op_name(t->op),
|
||||
src0->name, ggml_ne_string(src0).c_str(),
|
||||
src1 ? src1_str : "",
|
||||
ggml_ne_string(t).c_str());
|
||||
|
||||
|
||||
// copy the data from the GPU memory if needed
|
||||
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
||||
|
||||
if (!is_host) {
|
||||
auto n_bytes = ggml_nbytes(t);
|
||||
cb_data->data.resize(n_bytes);
|
||||
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
||||
}
|
||||
|
||||
if (!ggml_is_quantized(t->type)) {
|
||||
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
||||
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool run(llama_context * ctx, const gpt_params & params) {
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
|
||||
callback_data cb_data;
|
||||
|
||||
gpt_params params;
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// pass the callback to the backend scheduler
|
||||
// it will be executed for each node during the graph computation
|
||||
params.cb_eval = ggml_debug;
|
||||
params.cb_eval_user_data = &cb_data;
|
||||
params.warmup = false;
|
||||
|
||||
// init
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == nullptr || ctx == nullptr) {
|
||||
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
bool OK = run(ctx, params);
|
||||
if (!OK) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
|
|||
size_t pos = 0;
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
auto prev_stacks = grammar->stacks;
|
||||
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
||||
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
|
||||
if (grammar->stacks.empty()) {
|
||||
error_pos = pos;
|
||||
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
|
||||
|
|
|
@ -142,7 +142,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
|
|||
}
|
||||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
static bool gguf_ex_read_1(const std::string & fname) {
|
||||
static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
|
@ -206,7 +206,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
|||
printf("\n\n");
|
||||
|
||||
// check data
|
||||
{
|
||||
if (check_data) {
|
||||
const float * data = (const float *) cur->data;
|
||||
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
||||
if (data[j] != 100 + i) {
|
||||
|
@ -229,9 +229,16 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
|||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3) {
|
||||
printf("usage: %s data.gguf r|w\n", argv[0]);
|
||||
printf("usage: %s data.gguf r|w [n]\n", argv[0]);
|
||||
printf("r: read data.gguf file\n");
|
||||
printf("w: write data.gguf file\n");
|
||||
printf("n: no check of tensor data\n");
|
||||
return -1;
|
||||
}
|
||||
bool check_data = true;
|
||||
if (argc == 4) {
|
||||
check_data = false;
|
||||
}
|
||||
|
||||
const std::string fname(argv[1]);
|
||||
const std::string mode (argv[2]);
|
||||
|
@ -242,7 +249,7 @@ int main(int argc, char ** argv) {
|
|||
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
|
||||
} else if (mode == "r") {
|
||||
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
|
||||
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
|
||||
GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -349,12 +349,13 @@ static void process_logits(
|
|||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
|
@ -596,24 +597,18 @@ int main(int argc, char ** argv) {
|
|||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
||||
|
||||
// pass the callback to the backend scheduler
|
||||
// it will be executed for each node during the graph computation
|
||||
cparams.cb_eval = ik_collect_imatrix;
|
||||
cparams.cb_eval_user_data = NULL;
|
||||
params.cb_eval = ik_collect_imatrix;
|
||||
params.cb_eval_user_data = NULL;
|
||||
params.warmup = false;
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to create context\n", __func__);
|
||||
// init
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == nullptr || ctx == nullptr) {
|
||||
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -239,6 +239,7 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
bool suff_rm_leading_spc = params.escape;
|
||||
|
@ -279,10 +280,10 @@ int main(int argc, char ** argv) {
|
|||
if (ctx_guidance) {
|
||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
|
||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||
|
||||
original_prompt_len = original_inp.size();
|
||||
|
|
|
@ -146,7 +146,6 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
int n_past = 0;
|
||||
|
||||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
|
||||
|
||||
std::string system_prompt, user_prompt;
|
||||
size_t image_pos = prompt.find("<image>");
|
||||
|
@ -180,7 +179,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
}
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
|
||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
|
||||
|
|
|
@ -64,13 +64,10 @@ int main(int argc, char ** argv) {
|
|||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
|
||||
// Tokenize the prompt
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
LOG("add_bos tgt: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> inp;
|
||||
std::vector<llama_token> all;
|
||||
|
||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||
all = inp;
|
||||
|
||||
const int max_context_size = llama_n_ctx(ctx);
|
||||
|
|
|
@ -28,10 +28,8 @@ int main(int argc, char ** argv){
|
|||
GGML_ASSERT(model != nullptr);
|
||||
|
||||
// tokenize the prompt
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||
fprintf(stderr, "%s: tokenization done\n", __func__);
|
||||
|
||||
|
||||
|
|
|
@ -34,11 +34,8 @@ int main(int argc, char ** argv){
|
|||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||
|
||||
// tokenize the prompt
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
LOG("add_bos tgt: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||
|
||||
llama_ngram_cache ngram_cache_context;
|
||||
llama_ngram_cache ngram_cache_dynamic;
|
||||
|
|
|
@ -42,11 +42,8 @@ int main(int argc, char ** argv){
|
|||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||
|
||||
// tokenize the prompt
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
LOG("add_bos tgt: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||
|
||||
llama_ngram_cache ngram_cache_context;
|
||||
llama_ngram_cache ngram_cache_dynamic;
|
||||
|
|
|
@ -246,6 +246,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
@ -255,7 +256,7 @@ int main(int argc, char ** argv) {
|
|||
if (params.chatml) {
|
||||
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
||||
}
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||
} else {
|
||||
LOG("use session tokens\n");
|
||||
embd_inp = session_tokens;
|
||||
|
@ -277,10 +278,10 @@ int main(int argc, char ** argv) {
|
|||
if (ctx_guidance) {
|
||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
|
||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||
|
||||
original_prompt_len = original_inp.size();
|
||||
|
@ -339,14 +340,14 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// prefix & suffix for instruct mode
|
||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
|
||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
|
||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
|
||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
|
||||
|
||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||
|
||||
// chatml prefix & suffix
|
||||
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
|
||||
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
|
||||
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
|
||||
|
||||
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
|
||||
|
|
|
@ -315,10 +315,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||
// BOS tokens will be added for each chunk before eval
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
|
@ -454,6 +455,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||
// BOS tokens will be added for each chunk before eval
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||
|
||||
std::ofstream logits_stream;
|
||||
if (!params.logits_file.empty()) {
|
||||
|
@ -470,7 +472,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
|
@ -771,9 +773,6 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
||||
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
||||
|
||||
// This is needed as usual for LLaMA models
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
|
||||
// The tasks should be randomized so the score stabilizes quickly.
|
||||
bool randomize_tasks = true;
|
||||
|
||||
|
@ -818,7 +817,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
||||
for (size_t j = 0; j < 4; j++) {
|
||||
hs_cur.ending[j] = prompt_lines[idx*6+2+j];
|
||||
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
|
||||
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
|
||||
}
|
||||
|
||||
// determine the common prefix of the endings
|
||||
|
@ -837,7 +836,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||
hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
|
||||
hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
|
||||
|
||||
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
|
||||
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());
|
||||
|
||||
// Delete the selected random example from the prompt
|
||||
if (randomize_tasks) {
|
||||
|
@ -1110,12 +1109,9 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||
|
||||
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
|
||||
|
||||
// This is needed as usual for LLaMA models
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
|
||||
for (auto & task : data) {
|
||||
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
|
||||
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
|
||||
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
|
||||
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
|
||||
|
||||
task.common_prefix = 0;
|
||||
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
|
||||
|
@ -1130,8 +1126,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||
task.seq_tokens[0].size() - task.common_prefix +
|
||||
task.seq_tokens[1].size() - task.common_prefix;
|
||||
|
||||
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
|
||||
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
|
||||
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
|
||||
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
||||
|
@ -1322,7 +1318,7 @@ struct multiple_choice_task {
|
|||
std::vector<float> log_probs;
|
||||
};
|
||||
|
||||
static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
|
||||
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
|
||||
if (task.question.empty() || task.mc1.answers.empty()) {
|
||||
if (log_error) {
|
||||
printf("%s: found bad task with empty question and/or answers\n", __func__);
|
||||
|
@ -1337,7 +1333,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
|
|||
}
|
||||
return false;
|
||||
}
|
||||
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
|
||||
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
|
||||
}
|
||||
auto min_len = task.seq_tokens.front().size();
|
||||
for (auto& seq : task.seq_tokens) {
|
||||
|
@ -1436,9 +1432,6 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||
n_task = params.multiple_choice_tasks;
|
||||
}
|
||||
|
||||
// This is needed as usual for LLaMA models
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
|
||||
printf("%s: preparing task data", __func__);
|
||||
fflush(stdout);
|
||||
if (n_task > 500) {
|
||||
|
@ -1446,7 +1439,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||
fflush(stdout);
|
||||
std::atomic<int> counter(0);
|
||||
std::atomic<int> n_bad(0);
|
||||
auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
|
||||
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
|
||||
int num_tasks = tasks.size();
|
||||
int n_bad_local = 0;
|
||||
while (true) {
|
||||
|
@ -1457,7 +1450,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||
}
|
||||
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
|
||||
for (int i = first; i < last; ++i) {
|
||||
if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
|
||||
if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -1479,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||
int i_task = 0;
|
||||
for (auto& task : tasks) {
|
||||
++i_task;
|
||||
if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
|
||||
if (!multiple_choice_prepare_one_task(ctx, task, true)) {
|
||||
return;
|
||||
}
|
||||
if (i_task%n_dot == 0) {
|
||||
|
@ -1715,6 +1708,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
|
||||
const int nv = 2*((n_vocab + 1)/2) + 4;
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||
|
||||
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
|
||||
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -51,6 +51,26 @@
|
|||
margin-bottom: 0.5em;
|
||||
}
|
||||
|
||||
button, input, textarea, .button, a.button, select {
|
||||
color: #666;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 4px;
|
||||
line-height: 1.5em;
|
||||
padding: 0.25em 0.25em;
|
||||
text-decoration: none;
|
||||
font-size: 1.1rem;
|
||||
}
|
||||
|
||||
button {
|
||||
border: 1px solid #2a8aad;
|
||||
background: #3584e4;
|
||||
font-weight: normal;
|
||||
color: #fff;
|
||||
}
|
||||
button:disabled {
|
||||
background: #9cbce5;
|
||||
}
|
||||
|
||||
#write form {
|
||||
margin: 1em 0 0 0;
|
||||
display: flex;
|
||||
|
@ -406,7 +426,7 @@
|
|||
throw new Error("already running");
|
||||
}
|
||||
controller.value = new AbortController();
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: document.baseURI.replace(/\/+$/, '') })) {
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
|
||||
const data = chunk.data;
|
||||
|
||||
if (data.stop) {
|
||||
|
@ -567,7 +587,7 @@
|
|||
runCompletion();
|
||||
}
|
||||
return html`
|
||||
<div>
|
||||
<div class="right">
|
||||
<button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
|
||||
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
|
||||
<button onclick=${reset}>Reset</button>
|
||||
|
@ -1015,6 +1035,10 @@
|
|||
}
|
||||
|
||||
function App(props) {
|
||||
useEffect(() => {
|
||||
const query = new URLSearchParams(location.search).get("q");
|
||||
if (query) chat(query);
|
||||
}, []);
|
||||
|
||||
return html`
|
||||
<div class="mode-${session.value.type}">
|
||||
|
|
|
@ -689,6 +689,7 @@ struct server_context {
|
|||
n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
add_bos_token = llama_should_add_bos_token(model);
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -758,7 +759,7 @@ struct server_context {
|
|||
metrics.init();
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
|
||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
|
||||
// TODO: currently, we tokenize using special tokens by default
|
||||
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
||||
// but it's better compared to completely ignoring ChatML and other chat templates
|
||||
|
@ -776,7 +777,7 @@ struct server_context {
|
|||
|
||||
std::vector<llama_token> p;
|
||||
if (first) {
|
||||
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
|
||||
first = false;
|
||||
} else {
|
||||
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||
|
@ -793,7 +794,7 @@ struct server_context {
|
|||
}
|
||||
} else {
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
|
||||
return prompt_tokens;
|
||||
|
@ -1058,7 +1059,7 @@ struct server_context {
|
|||
system_tokens.clear();
|
||||
|
||||
if (!system_prompt.empty()) {
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, true);
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
|
@ -1914,7 +1915,7 @@ struct server_context {
|
|||
prefix_tokens.push_back(llama_token_middle(model));
|
||||
prompt_tokens = prefix_tokens;
|
||||
} else {
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
||||
}
|
||||
|
||||
slot.n_past = 0;
|
||||
|
|
|
@ -76,6 +76,28 @@ int main(int argc, char ** argv) {
|
|||
params.n_threads_batch = params.n_threads_batch_draft;
|
||||
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
|
||||
|
||||
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
|
||||
LOG("vocab_type tgt: %d\n", vocab_type_tgt);
|
||||
|
||||
const bool vocab_type_dft = llama_vocab_type(model_dft);
|
||||
LOG("vocab_type dft: %d\n", vocab_type_dft);
|
||||
|
||||
if (vocab_type_tgt != vocab_type_dft) {
|
||||
fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (
|
||||
llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
|
||||
llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
|
||||
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
|
||||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)
|
||||
) {
|
||||
fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
{
|
||||
const int n_vocab_tgt = llama_n_vocab(model_tgt);
|
||||
const int n_vocab_dft = llama_n_vocab(model_dft);
|
||||
|
@ -105,20 +127,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
|
||||
// Tokenize the prompt
|
||||
const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
|
||||
LOG("add_bos tgt: %d\n", add_bos_tgt);
|
||||
|
||||
const bool add_bos_dft = llama_should_add_bos_token(model_dft);
|
||||
LOG("add_bos dft: %d\n", add_bos_dft);
|
||||
|
||||
if (add_bos_tgt != add_bos_dft) {
|
||||
fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
|
||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
|
||||
|
||||
const int max_context_size = llama_n_ctx(ctx_tgt);
|
||||
const int max_tokens_list_size = max_context_size - 4;
|
||||
|
|
|
@ -26,11 +26,9 @@ int main(int argc, char ** argv) {
|
|||
llama_context_params ctx_params = llama_context_default_params();
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
|
||||
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
||||
tokens = ::llama_tokenize(model, prompt, true, true);
|
||||
|
||||
for (int i = 0; i < (int) tokens.size(); i++) {
|
||||
if (printing_ids) {
|
||||
|
|
|
@ -1225,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|||
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// ldc == nrows of the matrix that cuBLAS writes into
|
||||
int ldc = id == ctx.device ? ne0 : row_diff;
|
||||
int64_t ldc = id == ctx.device ? ne0 : row_diff;
|
||||
|
||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||
|
||||
|
@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
|
|||
const int64_t ne0 = dst->ne[0];
|
||||
const int64_t ne1 = dst->ne[1];
|
||||
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
const int64_t nb2 = dst->nb[2];
|
||||
const int64_t nb3 = dst->nb[3];
|
||||
|
||||
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
|
||||
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
|
||||
|
|
|
@ -394,7 +394,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
|||
// TODO: move to ggml-common.h
|
||||
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
||||
|
||||
|
||||
//////////////////////
|
||||
|
|
|
@ -4,14 +4,14 @@
|
|||
#define CUDA_Q8_0_NE_ALIGN 2048
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
||||
const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
||||
const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int ib = i/qk; // block index
|
||||
const int64_t ib = i/qk; // block index
|
||||
const int iqs = (i%qk)/qr; // quant index
|
||||
const int iybs = i - i%qk; // y block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
@ -25,7 +25,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
|
|||
}
|
||||
|
||||
template <bool need_check>
|
||||
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
|
||||
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
|
||||
#if __CUDA_ARCH__ >= CC_PASCAL
|
||||
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
||||
|
||||
|
@ -68,13 +68,13 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
// assume 32 threads
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8;
|
||||
const int ir = tid%8;
|
||||
const int ib = 8*i + ir;
|
||||
const int64_t ib = 8*i + ir;
|
||||
if (ib >= nb32) {
|
||||
return;
|
||||
}
|
||||
|
@ -96,13 +96,13 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
|
|||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
// assume 32 threads
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8;
|
||||
const int ir = tid%8;
|
||||
const int ib = 8*i + ir;
|
||||
const int64_t ib = 8*i + ir;
|
||||
if (ib >= nb32) {
|
||||
return;
|
||||
}
|
||||
|
@ -313,14 +313,14 @@ template<typename dst_t>
|
|||
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q6_K * x = (const block_q6_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int64_t i = blockIdx.x;
|
||||
#if QK_K == 256
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int tid = threadIdx.x;
|
||||
const int ip = tid/32; // ip is 0 or 1
|
||||
const int il = tid - 32*ip; // 0...32
|
||||
const int is = 8*ip + il/16;
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t ip = tid/32; // ip is 0 or 1
|
||||
const int64_t il = tid - 32*ip; // 0...32
|
||||
const int64_t is = 8*ip + il/16;
|
||||
|
||||
dst_t * y = yy + i*QK_K + 128*ip + il;
|
||||
|
||||
|
@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
|||
#else
|
||||
|
||||
// assume 32 threads
|
||||
const int tid = threadIdx.x;
|
||||
const int ip = tid/16; // 0 or 1
|
||||
const int il = tid - 16*ip; // 0...15
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t ip = tid/16; // 0 or 1
|
||||
const int64_t il = tid - 16*ip; // 0...15
|
||||
|
||||
dst_t * y = yy + i*QK_K + 16*ip + il;
|
||||
|
||||
|
@ -571,12 +571,12 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
|||
#endif
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
|
||||
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
||||
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
|
||||
if (k % CUDA_Q8_0_NE_ALIGN == 0) {
|
||||
const bool need_check = false;
|
||||
|
@ -588,7 +588,7 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
|
|||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
|
@ -598,7 +598,7 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
|
@ -608,27 +608,27 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb32 = k / 32;
|
||||
const int nb = (k + 255) / 256;
|
||||
dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb32 = k / 32;
|
||||
const int nb = (k + 255) / 256;
|
||||
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
|
@ -638,7 +638,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
|
@ -648,55 +648,55 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
#if QK_K == 64
|
||||
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
||||
|
@ -706,8 +706,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
|
|||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
||||
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
|
@ -719,7 +719,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
|
|||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
||||
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
||||
|
||||
template<typename T>
|
||||
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
|
||||
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
|
||||
|
||||
typedef to_t_cuda_t<float> to_fp32_cuda_t;
|
||||
typedef to_t_cuda_t<half> to_fp16_cuda_t;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#include "common.cuh"
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
||||
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
||||
const block_q4_0 * x = (const block_q4_0 *) vx;
|
||||
|
||||
const dfloat d = x[ib].d;
|
||||
|
@ -19,7 +19,7 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
|
|||
#endif // GGML_CUDA_F16
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
||||
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
||||
const block_q4_1 * x = (const block_q4_1 *) vx;
|
||||
|
||||
const dfloat d = __low2half(x[ib].dm);
|
||||
|
@ -39,7 +39,7 @@ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const in
|
|||
#endif // GGML_CUDA_F16
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
||||
static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
||||
const block_q5_0 * x = (const block_q5_0 *) vx;
|
||||
|
||||
const dfloat d = x[ib].d;
|
||||
|
@ -62,7 +62,7 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
|
|||
#endif // GGML_CUDA_F16
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
||||
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
||||
const block_q5_1 * x = (const block_q5_1 *) vx;
|
||||
|
||||
const dfloat d = __low2half(x[ib].dm);
|
||||
|
@ -86,7 +86,7 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
|
|||
#endif // GGML_CUDA_F16
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
||||
static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
||||
const block_q8_0 * x = (const block_q8_0 *) vx;
|
||||
|
||||
const dfloat d = x[ib].d;
|
||||
|
|
|
@ -565,7 +565,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
|||
}
|
||||
}
|
||||
|
||||
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
||||
static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
||||
const half * x = (const half *) vx;
|
||||
|
||||
// automatic half -> float type cast if dfloat == float
|
||||
|
@ -577,7 +577,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
|||
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
|
||||
// qk = quantized weights per x block
|
||||
// qr = number of quantized weights per data value in x block
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
|
||||
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
|
@ -598,7 +598,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|||
|
||||
for (int i = 0; i < ncols; i += iter_stride) {
|
||||
const int col = i + vals_per_iter*tid;
|
||||
const int ib = (row*ncols + col)/qk; // x block index
|
||||
const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
|
||||
const int iqs = (col%qk)/qr; // x quant index
|
||||
const int iybs = col - col%qk; // y block start index
|
||||
|
||||
|
|
|
@ -1,20 +1,20 @@
|
|||
#include "quantize.cuh"
|
||||
|
||||
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
|
||||
const int ix = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
|
||||
const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (ix >= kx_padded) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int iy = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
|
||||
|
||||
const int i_padded = iy*kx_padded + ix;
|
||||
const int64_t i_padded = (int64_t)iy*kx_padded + ix;
|
||||
|
||||
block_q8_1 * y = (block_q8_1 *) vy;
|
||||
|
||||
const int ib = i_padded / QK8_1; // block index
|
||||
const int iqs = i_padded % QK8_1; // quant index
|
||||
const int64_t ib = i_padded / QK8_1; // block index
|
||||
const int64_t iqs = i_padded % QK8_1; // quant index
|
||||
|
||||
const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
|
||||
float amax = fabsf(xi);
|
||||
|
@ -36,8 +36,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
|||
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
||||
}
|
||||
|
||||
void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
|
||||
const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
||||
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
|
||||
const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
||||
const dim3 num_blocks(block_num_x, ky, 1);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
|
||||
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
|
||||
|
|
|
@ -2,4 +2,4 @@
|
|||
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
||||
|
||||
void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream);
|
||||
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
|
||||
|
|
310
ggml-quants.c
310
ggml-quants.c
File diff suppressed because it is too large
Load diff
148
ggml-quants.h
148
ggml-quants.h
|
@ -12,70 +12,70 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
@ -101,26 +101,26 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
||||
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
void iq2xs_init_impl(enum ggml_type type);
|
||||
void iq2xs_free_impl(enum ggml_type type);
|
||||
|
|
16
ggml.c
16
ggml.c
|
@ -338,14 +338,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|||
return GGML_FP32_TO_FP16(x);
|
||||
}
|
||||
|
||||
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
|
||||
for (int64_t i = 0; i < n; i++) {
|
||||
y[i] = GGML_FP16_TO_FP32(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
|
||||
int i = 0;
|
||||
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
#if defined(__F16C__)
|
||||
for (; i + 7 < n; i += 8) {
|
||||
__m256 x_vec = _mm256_loadu_ps(x + i);
|
||||
|
@ -20331,11 +20331,11 @@ size_t ggml_quantize_chunk(
|
|||
enum ggml_type type,
|
||||
const float * src,
|
||||
void * dst,
|
||||
int start,
|
||||
int nrows,
|
||||
int n_per_row,
|
||||
int64_t start,
|
||||
int64_t nrows,
|
||||
int64_t n_per_row,
|
||||
const float * imatrix) {
|
||||
const int n = nrows * n_per_row;
|
||||
const int64_t n = (int64_t) nrows * n_per_row;
|
||||
|
||||
if (ggml_quantize_requires_imatrix(type)) {
|
||||
GGML_ASSERT(imatrix != NULL);
|
||||
|
|
14
ggml.h
14
ggml.h
|
@ -332,8 +332,8 @@ extern "C" {
|
|||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
||||
|
||||
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
|
||||
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
|
||||
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
|
||||
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
|
@ -2210,9 +2210,9 @@ extern "C" {
|
|||
enum ggml_type type,
|
||||
const float * src,
|
||||
void * dst,
|
||||
int start,
|
||||
int nrows,
|
||||
int n_per_row,
|
||||
int64_t start,
|
||||
int64_t nrows,
|
||||
int64_t n_per_row,
|
||||
const float * imatrix);
|
||||
|
||||
//
|
||||
|
@ -2377,8 +2377,8 @@ extern "C" {
|
|||
#else
|
||||
#define GGML_RESTRICT restrict
|
||||
#endif
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
|
||||
|
|
|
@ -639,6 +639,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
|
|
@ -285,12 +285,14 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
|
||||
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.q_norm", # cohere
|
||||
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_K_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
|
||||
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.k_norm", # cohere
|
||||
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
|
||||
),
|
||||
|
||||
|
|
297
llama.cpp
297
llama.cpp
|
@ -318,6 +318,8 @@ enum llm_kv {
|
|||
LLM_KV_TOKENIZER_UNK_ID,
|
||||
LLM_KV_TOKENIZER_SEP_ID,
|
||||
LLM_KV_TOKENIZER_PAD_ID,
|
||||
LLM_KV_TOKENIZER_CLS_ID,
|
||||
LLM_KV_TOKENIZER_MASK_ID,
|
||||
LLM_KV_TOKENIZER_ADD_BOS,
|
||||
LLM_KV_TOKENIZER_ADD_EOS,
|
||||
LLM_KV_TOKENIZER_ADD_PREFIX,
|
||||
|
@ -388,6 +390,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
||||
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
||||
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
||||
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
|
||||
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
|
||||
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
||||
|
@ -926,6 +930,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -1699,6 +1705,8 @@ enum e_model {
|
|||
MODEL_MEDIUM,
|
||||
MODEL_LARGE,
|
||||
MODEL_XL,
|
||||
MODEL_8x7B,
|
||||
MODEL_8x22B,
|
||||
};
|
||||
|
||||
static const size_t kiB = 1024;
|
||||
|
@ -2016,11 +2024,13 @@ struct llama_vocab {
|
|||
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
||||
|
||||
// default LLaMA special tokens
|
||||
id special_bos_id = 1;
|
||||
id special_eos_id = 2;
|
||||
id special_unk_id = 0;
|
||||
id special_sep_id = -1;
|
||||
id special_pad_id = -1;
|
||||
id special_bos_id = 1;
|
||||
id special_eos_id = 2;
|
||||
id special_unk_id = 0;
|
||||
id special_sep_id = -1;
|
||||
id special_pad_id = -1;
|
||||
id special_cls_id = -1;
|
||||
id special_mask_id = -1;
|
||||
|
||||
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
|
||||
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
|
||||
|
@ -3550,6 +3560,8 @@ static const char * llama_model_type_name(e_model type) {
|
|||
case MODEL_MEDIUM: return "0.4B";
|
||||
case MODEL_LARGE: return "0.8B";
|
||||
case MODEL_XL: return "1.5B";
|
||||
case MODEL_8x7B: return "8x7B";
|
||||
case MODEL_8x22B: return "8x22B";
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
|
@ -3664,15 +3676,23 @@ static void llm_load_hparams(
|
|||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 22: model.type = e_model::MODEL_1B; break;
|
||||
case 26: model.type = e_model::MODEL_3B; break;
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 40: model.type = e_model::MODEL_13B; break;
|
||||
case 48: model.type = e_model::MODEL_34B; break;
|
||||
case 60: model.type = e_model::MODEL_30B; break;
|
||||
case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
if (hparams.n_expert == 8) {
|
||||
switch (hparams.n_layer) {
|
||||
case 32: model.type = e_model::MODEL_8x7B; break;
|
||||
case 56: model.type = e_model::MODEL_8x22B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} else {
|
||||
switch (hparams.n_layer) {
|
||||
case 22: model.type = e_model::MODEL_1B; break;
|
||||
case 26: model.type = e_model::MODEL_3B; break;
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 40: model.type = e_model::MODEL_13B; break;
|
||||
case 48: model.type = e_model::MODEL_34B; break;
|
||||
case 60: model.type = e_model::MODEL_30B; break;
|
||||
case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_MINICPM:
|
||||
|
@ -3976,7 +3996,9 @@ static void llm_load_hparams(
|
|||
}
|
||||
|
||||
// TODO: This should probably be in llama.h
|
||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
|
||||
static std::vector<llama_vocab::id> llama_tokenize_internal(
|
||||
const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
|
||||
);
|
||||
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
|
||||
|
||||
static void llm_load_vocab(
|
||||
|
@ -3998,23 +4020,27 @@ static void llm_load_vocab(
|
|||
vocab.type = LLAMA_VOCAB_TYPE_NONE;
|
||||
|
||||
// default special tokens
|
||||
vocab.special_bos_id = -1;
|
||||
vocab.special_eos_id = -1;
|
||||
vocab.special_unk_id = -1;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
vocab.linefeed_id = -1;
|
||||
vocab.special_bos_id = -1;
|
||||
vocab.special_eos_id = -1;
|
||||
vocab.special_unk_id = -1;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
vocab.special_cls_id = -1;
|
||||
vocab.special_mask_id = -1;
|
||||
vocab.linefeed_id = -1;
|
||||
|
||||
return;
|
||||
} else if (tokenizer_name == "llama") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
||||
|
||||
// default special tokens
|
||||
vocab.special_bos_id = 1;
|
||||
vocab.special_eos_id = 2;
|
||||
vocab.special_unk_id = 0;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
vocab.special_bos_id = 1;
|
||||
vocab.special_eos_id = 2;
|
||||
vocab.special_unk_id = 0;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
vocab.special_cls_id = -1;
|
||||
vocab.special_mask_id = -1;
|
||||
|
||||
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
||||
if (add_space_prefix_keyidx != -1) {
|
||||
|
@ -4049,20 +4075,24 @@ static void llm_load_vocab(
|
|||
}
|
||||
|
||||
// default special tokens
|
||||
vocab.special_bos_id = 11;
|
||||
vocab.special_eos_id = 11;
|
||||
vocab.special_unk_id = -1;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
vocab.special_bos_id = 11;
|
||||
vocab.special_eos_id = 11;
|
||||
vocab.special_unk_id = -1;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
vocab.special_cls_id = -1;
|
||||
vocab.special_mask_id = -1;
|
||||
} else if (tokenizer_name == "bert") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
||||
|
||||
// default special tokens
|
||||
vocab.special_bos_id = 101;
|
||||
vocab.special_eos_id = 102;
|
||||
vocab.special_unk_id = 100;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
vocab.special_bos_id = -1;
|
||||
vocab.special_eos_id = -1;
|
||||
vocab.special_unk_id = 100;
|
||||
vocab.special_sep_id = 102;
|
||||
vocab.special_pad_id = 0;
|
||||
vocab.special_cls_id = 101;
|
||||
vocab.special_mask_id = 103;
|
||||
vocab.add_space_prefix = false;
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
|
||||
|
@ -4125,11 +4155,13 @@ static void llm_load_vocab(
|
|||
// special tokens
|
||||
{
|
||||
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
|
||||
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
|
||||
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
|
||||
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
|
||||
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
|
||||
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
|
||||
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
|
||||
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
|
||||
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
|
||||
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
|
||||
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
|
||||
{ LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
|
||||
{ LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
|
||||
};
|
||||
for (const auto & it : special_token_types) {
|
||||
const std::string & key = kv(std::get<0>(it));
|
||||
|
@ -4321,12 +4353,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
|
||||
|
||||
// special tokens
|
||||
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
||||
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
||||
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
||||
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
||||
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
||||
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
||||
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
||||
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
||||
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
||||
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
||||
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
||||
if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
|
||||
if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
|
||||
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
||||
}
|
||||
|
||||
// Returns false if cancelled by progress_callback
|
||||
|
@ -5406,6 +5440,11 @@ static bool llm_load_tensors(
|
|||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
|
||||
if (n_layer >= 64){
|
||||
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
|
||||
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
|
||||
}
|
||||
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
|
@ -9454,6 +9493,31 @@ struct llm_build_context {
|
|||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
if (model.layers[il].attn_q_norm) {
|
||||
Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(Qcur) * n_embd_head,
|
||||
ggml_element_size(Qcur) * n_embd_head * n_head,
|
||||
0);
|
||||
cb(Qcur, "Qcur", il);
|
||||
Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
|
||||
ggml_element_size(Kcur) * n_embd_head,
|
||||
ggml_element_size(Kcur) * n_embd_head * n_head_kv,
|
||||
0);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
||||
model.layers[il].attn_q_norm,
|
||||
NULL,
|
||||
LLM_NORM, cb, il);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
||||
model.layers[il].attn_k_norm,
|
||||
NULL,
|
||||
LLM_NORM, cb, il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
|
@ -11057,7 +11121,7 @@ struct llm_tokenizer_bpe {
|
|||
add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
|
||||
}
|
||||
|
||||
// add the fnished tokens to the final list keeping correct order for next and prev
|
||||
// add the finished tokens to the final list keeping correct order for next and prev
|
||||
for (auto & sym : symbols) {
|
||||
if (sym.n > 0) {
|
||||
sym.prev = final_prev_index;
|
||||
|
@ -11326,9 +11390,6 @@ struct llm_tokenizer_wpm {
|
|||
output.push_back(vocab.special_unk_id);
|
||||
}
|
||||
}
|
||||
|
||||
// append eos token
|
||||
output.push_back(vocab.special_eos_id);
|
||||
}
|
||||
|
||||
std::vector<std::string> preprocess(const std::string & text) {
|
||||
|
@ -11533,30 +11594,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|||
}
|
||||
}
|
||||
|
||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
|
||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
|
||||
std::vector<llama_vocab::id> output;
|
||||
|
||||
// OG tokenizer behavior:
|
||||
//
|
||||
// tokenizer.encode('', add_bos=True) returns [1]
|
||||
// tokenizer.encode('', add_bos=False) returns []
|
||||
|
||||
if (bos && vocab.special_bos_id != -1) {
|
||||
output.push_back(vocab.special_bos_id);
|
||||
}
|
||||
|
||||
if (raw_text.empty()) {
|
||||
return output;
|
||||
}
|
||||
|
||||
std::forward_list<fragment_buffer_variant> fragment_buffer;
|
||||
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
|
||||
|
||||
if (special) tokenizer_st_partition(vocab, fragment_buffer);
|
||||
if (!raw_text.empty()) {
|
||||
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
|
||||
if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
|
||||
}
|
||||
|
||||
switch (vocab.type) {
|
||||
case LLAMA_VOCAB_TYPE_SPM:
|
||||
{
|
||||
// OG tokenizer behavior:
|
||||
//
|
||||
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
||||
// tokenizer.encode('', add_special_tokens=False) returns []
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
}
|
||||
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
||||
|
@ -11582,9 +11641,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|||
output.push_back(fragment.token);
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_eos == 1) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_BPE:
|
||||
{
|
||||
if (add_special && vocab.special_add_bos == 1) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
}
|
||||
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
@ -11598,9 +11667,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|||
output.push_back(fragment.token);
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(vocab.special_add_eos != 1);
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_WPM:
|
||||
{
|
||||
if (add_special) {
|
||||
GGML_ASSERT(vocab.special_cls_id != -1);
|
||||
output.push_back(vocab.special_cls_id);
|
||||
}
|
||||
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
@ -11614,6 +11690,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|||
output.push_back(fragment.token);
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special) {
|
||||
GGML_ASSERT(vocab.special_sep_id != -1);
|
||||
output.push_back(vocab.special_sep_id);
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_NONE:
|
||||
GGML_ASSERT(false);
|
||||
|
@ -11780,7 +11861,9 @@ static void llama_grammar_advance_stack(
|
|||
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
||||
|
||||
if (stack.empty()) {
|
||||
new_stacks.emplace_back(stack);
|
||||
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
||||
new_stacks.emplace_back(stack);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -11817,7 +11900,10 @@ static void llama_grammar_advance_stack(
|
|||
}
|
||||
case LLAMA_GRETYPE_CHAR:
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
new_stacks.emplace_back(stack);
|
||||
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
||||
// only add the stack if it's not a duplicate of one we already have
|
||||
new_stacks.emplace_back(stack);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
|
||||
|
@ -11831,12 +11917,13 @@ static void llama_grammar_advance_stack(
|
|||
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
||||
// produces the N possible stacks if the given char is accepted at those
|
||||
// positions
|
||||
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
||||
void llama_grammar_accept(
|
||||
const std::vector<std::vector<llama_grammar_element>> & rules,
|
||||
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
||||
const uint32_t chr) {
|
||||
const uint32_t chr,
|
||||
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
||||
|
||||
std::vector<std::vector<const llama_grammar_element *>> new_stacks;
|
||||
new_stacks.clear();
|
||||
|
||||
for (const auto & stack : stacks) {
|
||||
if (stack.empty()) {
|
||||
|
@ -11855,8 +11942,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
|||
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
||||
}
|
||||
}
|
||||
|
||||
return new_stacks;
|
||||
}
|
||||
|
||||
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
||||
|
@ -11870,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|||
const std::vector<llama_grammar_candidate> & candidates) {
|
||||
|
||||
std::vector<llama_grammar_candidate> rejects;
|
||||
rejects.reserve(candidates.size());
|
||||
|
||||
if (stack.empty()) {
|
||||
for (const auto & tok : candidates) {
|
||||
|
@ -11883,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|||
const llama_grammar_element * stack_pos = stack.back();
|
||||
|
||||
std::vector<llama_grammar_candidate> next_candidates;
|
||||
next_candidates.reserve(candidates.size());
|
||||
|
||||
for (const auto & tok : candidates) {
|
||||
if (*tok.code_points == 0) {
|
||||
// reached end of full codepoints in token, reject iff it ended in a partial sequence
|
||||
|
@ -12690,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
||||
const auto & code_points = decoded.first;
|
||||
std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
||||
llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
|
||||
grammar->stacks = tmp_new_stacks;
|
||||
}
|
||||
grammar->partial_utf8 = decoded.second;
|
||||
GGML_ASSERT(!grammar->stacks.empty());
|
||||
|
@ -13323,9 +13413,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|||
return new_type;
|
||||
}
|
||||
|
||||
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
std::mutex mutex;
|
||||
int counter = 0;
|
||||
int64_t counter = 0;
|
||||
size_t new_size = 0;
|
||||
if (nthread < 2) {
|
||||
// single-thread
|
||||
|
@ -13333,11 +13423,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|||
}
|
||||
auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
|
||||
nrows, n_per_row, imatrix]() {
|
||||
const int nrows_per_chunk = chunk_size / n_per_row;
|
||||
const int64_t nrows_per_chunk = chunk_size / n_per_row;
|
||||
size_t local_size = 0;
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
int first_row = counter; counter += nrows_per_chunk;
|
||||
int64_t first_row = counter; counter += nrows_per_chunk;
|
||||
if (first_row >= nrows) {
|
||||
if (local_size > 0) {
|
||||
new_size += local_size;
|
||||
|
@ -13345,7 +13435,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
||||
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
||||
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
|
||||
}
|
||||
};
|
||||
|
@ -13468,7 +13558,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
const std::string name = ggml_get_name(meta);
|
||||
|
||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
|
||||
if (name.find("attn_v.weight") != std::string::npos ||
|
||||
name.find("attn_qkv.weight") != std::string::npos) {
|
||||
++qs.n_attention_wv;
|
||||
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
||||
qs.has_output = true;
|
||||
|
@ -13478,7 +13569,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
||||
|
||||
// sanity checks
|
||||
GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
|
||||
//
|
||||
// - qs.n_attention_wv == 0 for Mamba models
|
||||
// - qs.n_attention_wv == model.hparams.n_layer for Transformer models
|
||||
//
|
||||
GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
|
||||
|
||||
size_t total_size_org = 0;
|
||||
size_t total_size_new = 0;
|
||||
|
@ -13534,6 +13629,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
|
||||
// quantize only 2D and 3D tensors (experts)
|
||||
quantize &= (ggml_n_dims(tensor) >= 2);
|
||||
|
||||
// do not quantize norm tensors
|
||||
quantize &= name.find("_norm.weight") == std::string::npos;
|
||||
|
||||
quantize &= params->quantize_output_tensor || name != "output.weight";
|
||||
quantize &= !params->only_copy;
|
||||
|
||||
|
@ -13580,7 +13679,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
new_size = ggml_nbytes(tensor);
|
||||
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
} else {
|
||||
const size_t nelements = ggml_nelements(tensor);
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
|
@ -13632,20 +13731,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
|
||||
if (work.size() < nelements * 4) {
|
||||
if (work.size() < (size_t)nelements * 4) {
|
||||
work.resize(nelements * 4); // upper bound on size
|
||||
}
|
||||
new_data = work.data();
|
||||
|
||||
const int n_per_row = tensor->ne[0];
|
||||
const int nrows = tensor->ne[1];
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
static const int min_chunk_size = 32 * 512;
|
||||
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
||||
|
||||
const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
||||
|
||||
// quantize each expert separately since they have different importance matrices
|
||||
new_size = 0;
|
||||
|
@ -16063,6 +16162,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
|
|||
return model->vocab.special_eos_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_cls(const struct llama_model * model) {
|
||||
return model->vocab.special_cls_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_sep(const struct llama_model * model) {
|
||||
return model->vocab.special_sep_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_nl(const struct llama_model * model) {
|
||||
return model->vocab.linefeed_id;
|
||||
}
|
||||
|
@ -16097,9 +16204,9 @@ int32_t llama_tokenize(
|
|||
int32_t text_len,
|
||||
llama_token * tokens,
|
||||
int32_t n_tokens_max,
|
||||
bool add_bos,
|
||||
bool special) {
|
||||
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
|
||||
|
||||
if (n_tokens_max < (int) res.size()) {
|
||||
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
||||
|
|
15
llama.h
15
llama.h
|
@ -786,6 +786,8 @@ extern "C" {
|
|||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
|
||||
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
|
||||
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
||||
|
||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||
|
@ -808,16 +810,16 @@ extern "C" {
|
|||
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
||||
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
||||
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
||||
/// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
|
||||
/// Does not insert a leading space.
|
||||
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
||||
/// as plaintext. Does not insert a leading space.
|
||||
LLAMA_API int32_t llama_tokenize(
|
||||
const struct llama_model * model,
|
||||
const char * text,
|
||||
int32_t text_len,
|
||||
llama_token * tokens,
|
||||
int32_t n_tokens_max,
|
||||
bool add_bos,
|
||||
bool special);
|
||||
bool add_special,
|
||||
bool parse_special);
|
||||
|
||||
// Token Id -> Piece.
|
||||
// Uses the vocabulary in the provided context.
|
||||
|
@ -1095,10 +1097,11 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
|
|||
struct llama_context * ctx
|
||||
);
|
||||
|
||||
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
||||
void llama_grammar_accept(
|
||||
const std::vector<std::vector<llama_grammar_element>> & rules,
|
||||
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
||||
const uint32_t chr);
|
||||
const uint32_t chr,
|
||||
std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
|
||||
|
||||
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
const std::string & src,
|
||||
|
|
9
scripts/gen-authors.sh
Executable file
9
scripts/gen-authors.sh
Executable file
|
@ -0,0 +1,9 @@
|
|||
#!/bin/bash
|
||||
|
||||
printf "# date: $(date)\n" > AUTHORS
|
||||
printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS
|
||||
|
||||
git log --format='%an <%ae>' --reverse --date=short master | awk '!seen[$0]++' | sort >> AUTHORS
|
||||
|
||||
# if necessary, update your name here. for example: jdoe -> John Doe
|
||||
sed -i '' 's/^jdoe/John Doe/g' AUTHORS
|
|
@ -3,9 +3,9 @@
|
|||
# Shortcut for downloading HF models
|
||||
#
|
||||
# Usage:
|
||||
# ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||
# ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||
# ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||
# ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||
# ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||
# ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||
#
|
||||
|
||||
# all logs go to stderr
|
||||
|
@ -14,7 +14,7 @@ function log {
|
|||
}
|
||||
|
||||
function usage {
|
||||
log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [-h|--help]"
|
||||
log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [--outdir <dir> [-h|--help]"
|
||||
exit 1
|
||||
}
|
||||
|
||||
|
@ -26,9 +26,9 @@ function has_cmd {
|
|||
}
|
||||
|
||||
if has_cmd wget; then
|
||||
cmd="wget -q --show-progress -c -O %s %s"
|
||||
cmd="wget -q --show-progress -c -O %s/%s %s"
|
||||
elif has_cmd curl; then
|
||||
cmd="curl -C - -f -o %s -L %s"
|
||||
cmd="curl -C - -f --output-dir %s -o %s -L %s"
|
||||
else
|
||||
log "[E] curl or wget not found"
|
||||
exit 1
|
||||
|
@ -37,6 +37,7 @@ fi
|
|||
url=""
|
||||
repo=""
|
||||
file=""
|
||||
outdir="."
|
||||
|
||||
# parse args
|
||||
while [[ $# -gt 0 ]]; do
|
||||
|
@ -53,6 +54,10 @@ while [[ $# -gt 0 ]]; do
|
|||
file="$2"
|
||||
shift 2
|
||||
;;
|
||||
--outdir)
|
||||
outdir="$2"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
;;
|
||||
|
@ -94,10 +99,10 @@ basename=$(basename $url)
|
|||
log "[+] attempting to download $basename"
|
||||
|
||||
if [ -n "$cmd" ]; then
|
||||
cmd=$(printf "$cmd" "$basename" "$url")
|
||||
cmd=$(printf "$cmd" "$outdir" "$basename" "$url")
|
||||
log "[+] $cmd"
|
||||
if $cmd; then
|
||||
echo $basename
|
||||
echo $outdir/$basename
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
|
|
|
@ -66,6 +66,8 @@ while read c; do
|
|||
tests/test-quantize-fns.cpp \
|
||||
tests/test-quantize-perf.cpp \
|
||||
tests/test-backend-ops.cpp \
|
||||
LICENSE \
|
||||
scripts/gen-authors.sh \
|
||||
>> $SRC_LLAMA/ggml-src.patch
|
||||
done < $SRC_LLAMA/ggml-commits
|
||||
|
||||
|
@ -123,6 +125,9 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
|||
# tests/test-quantize-fns.cpp -> tests/test-quantize-fns.cpp
|
||||
# tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
|
||||
# tests/test-backend-ops.cpp -> tests/test-backend-ops.cpp
|
||||
#
|
||||
# LICENSE -> LICENSE
|
||||
# scripts/gen-authors.sh -> scripts/gen-authors.sh
|
||||
|
||||
cat ggml-src.patch | sed \
|
||||
-e 's/src\/ggml\.c/ggml.c/g' \
|
||||
|
@ -156,6 +161,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
|||
-e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \
|
||||
-e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \
|
||||
-e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \
|
||||
-e 's/LICENSE/LICENSE/g' \
|
||||
-e 's/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
|
||||
> ggml-src.patch.tmp
|
||||
mv ggml-src.patch.tmp ggml-src.patch
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
bb8d8cff851b2de6fde4904be492d39458837e1a
|
||||
98875cdb7e9ceeb726d1c196d2fecb3cbb59b93a
|
||||
|
|
|
@ -31,3 +31,6 @@ cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
|
|||
cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
|
||||
cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp
|
||||
cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp
|
||||
|
||||
cp -rpv ../LICENSE ./LICENSE
|
||||
cp -rpv ../ggml/scripts/gen-authors.sh ./scripts/gen-authors.sh
|
||||
|
|
|
@ -38,7 +38,7 @@ number ::= [0-9]+)""";
|
|||
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
auto prev_stacks = grammar->stacks;
|
||||
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
||||
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
|
||||
assert(!grammar->stacks.empty());
|
||||
}
|
||||
|
||||
|
@ -138,7 +138,7 @@ ws ::= [ \t\n\r]?)""";
|
|||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
++pos;
|
||||
auto prev_stacks = grammar->stacks;
|
||||
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
||||
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
|
||||
|
||||
// Expect that each code point will not cause the grammar to fail
|
||||
if (grammar->stacks.empty()) {
|
||||
|
@ -173,7 +173,7 @@ ws ::= [ \t\n\r]?)""";
|
|||
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
auto prev_stacks = grammar->stacks;
|
||||
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
||||
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
|
||||
if (grammar->stacks.empty()) {
|
||||
parse_failed = true;
|
||||
break;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue