Merge remote-tracking branch 'origin/master' into hp/model/support-dbrx
commit 6f813dcc6a
51 changed files with 5265 additions and 3236 deletions
AUTHORS (new file, 655 lines)
@@ -0,0 +1,655 @@
# date: Tue Apr 9 09:17:14 EEST 2024
# this file is auto-generated by scripts/gen-authors.sh

0cc4m <picard12@live.de>
0xspringtime <110655352+0xspringtime@users.noreply.github.com>
2f38b454 <dxf@protonmail.com>
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
44670 <44670@users.noreply.github.com>
AN Long <aisk@users.noreply.github.com>
AT <manyoso@users.noreply.github.com>
Aarni Koskela <akx@iki.fi>
Aaron Miller <apage43@ninjawhale.com>
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
Adithya Balaji <adithya.b94@gmail.com>
AdithyanI <adithyan.i4internet@gmail.com>
Adrian <smith.adriane@gmail.com>
Adrian Hesketh <a-h@users.noreply.github.com>
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
Aisuko <urakiny@gmail.com>
Alberto <57916483+albbus-stack@users.noreply.github.com>
Alex <awhill19@icloud.com>
Alex Azarov <alex@azarov.by>
Alex Azarov <alexander.azarov@mapbox.com>
Alex Klinkhamer <from.github.com.917@grencez.dev>
Alex Klinkhamer <git@grencez.dev>
Alex Nguyen <tiendung@users.noreply.github.com>
Alex Petenchea <alex.petenchea@gmail.com>
Alex Renda <alexrenda@users.noreply.github.com>
Alex von Gluck IV <kallisti5@unixzen.com>
Alexey Parfenov <zxed@alkatrazstudio.net>
Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
Ali Nehzat <ali.nehzat@thanks.dev>
Ali Tariq <ali.tariq@10xengineers.ai>
Alon <alonfaraj@gmail.com>
AlpinDale <52078762+AlpinDale@users.noreply.github.com>
AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
Ananta Bastola <anantarajbastola@gmail.com>
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
András Salamon <ott2@users.noreply.github.com>
Andrei <abetlen@gmail.com>
Andrew Canis <andrew.canis@gmail.com>
Andrew Duffy <a10y@users.noreply.github.com>
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
Arik Poznanski <arikpoz@users.noreply.github.com>
Artem <guinmoon@gmail.com>
Artyom Lebedev <vagran.ast@gmail.com>
Asbjørn Olling <asbjornolling@gmail.com>
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
Ashraful Islam <ashraful.meche@gmail.com>
Atsushi Tatsuma <yoshoku@outlook.com>
Austin <77757836+teleprint-me@users.noreply.github.com>
AustinMroz <austinmroz@utexas.edu>
BADR <contact@pythops.com>
Bach Le <bach@bullno1.com>
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
Behnam M <58621210+ibehnam@users.noreply.github.com>
Ben Garney <bengarney@users.noreply.github.com>
Ben Siraphob <bensiraphob@gmail.com>
Ben Williams <ben@719ben.com>
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
Bernat Vadell <hounter.caza@gmail.com>
Bodo Graumann <mail@bodograumann.de>
Bono Lv <lvscar@users.noreply.github.com>
Borislav Stanimirov <b.stanimirov@abv.bg>
Branden Butler <bwtbutler@hotmail.com>
Brian <mofosyne@gmail.com>
Bruce MacDonald <brucewmacdonald@gmail.com>
CJ Pais <cj@cjpais.com>
CRD716 <crd716@gmail.com>
Cameron <csteele@steelecameron.com>
Cameron Kaiser <classilla@users.noreply.github.com>
Casey Primozic <casey@cprimozic.net>
Casey Primozic <me@ameo.link>
CausalLM <148736309+CausalLM@users.noreply.github.com>
Cebtenzzre <cebtenzzre@gmail.com>
Chad Brewbaker <crb002@gmail.com>
Cheng Shao <terrorjack@type.dance>
Chris Kuehl <ckuehl@ckuehl.me>
Christian Demsar <christian@github.email.demsar.us>
Christian Demsar <crasm@git.vczf.us>
Christian Falch <875252+chrfalch@users.noreply.github.com>
Christian Kögler <ck3d@gmx.de>
Clark Saben <76020733+csaben@users.noreply.github.com>
Clint Herron <hanclinto@gmail.com>
Cuong Trinh Manh <nguoithichkhampha@gmail.com>
DAN™ <dranger003@gmail.com>
Damian Stewart <d@damianstewart.com>
Dane Madsen <dane_madsen@hotmail.com>
DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
Daniel Bevenius <daniel.bevenius@gmail.com>
Daniel Drake <drake@endlessos.org>
Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Daniel Illescas Romero <illescas.daniel@protonmail.com>
DannyDaemonic <DannyDaemonic@gmail.com>
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
Dave Della Costa <ddellacosta+github@gmail.com>
David Friehs <david@friehs.info>
David Kennedy <dakennedyd@gmail.com>
David Pflug <david@pflug.email>
David Renshaw <dwrenshaw@gmail.com>
David Sommers <12738+databyte@users.noreply.github.com>
David Yang <davidyang6us@gmail.com>
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
Dean <Dean.Sinaean@gmail.com>
Deins <deinsegle@gmail.com>
Didzis Gosko <didzis@users.noreply.github.com>
Don Mahurin <dmahurin@users.noreply.github.com>
DooWoong Lee (David) <manics99@naver.com>
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
Douglas Hanley <thesecretaryofwar@gmail.com>
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
Ebey Abraham <ebey97@gmail.com>
Ed Lee <edilee@mozilla.com>
Ed Lepedus <ed.lepedus@googlemail.com>
Edward Taylor <edeetee@gmail.com>
Elbios <141279586+Elbios@users.noreply.github.com>
Engininja2 <139037756+Engininja2@users.noreply.github.com>
Equim <sayaka@ekyu.moe>
Eric Sommerlade <es0m@users.noreply.github.com>
Eric Zhang <34133756+EZForever@users.noreply.github.com>
Erik Garrison <erik.garrison@gmail.com>
Erik Scholz <Green-Sky@users.noreply.github.com>
Ettore Di Giacinto <mudler@users.noreply.github.com>
Evan Jones <evan.q.jones@gmail.com>
Evan Miller <emmiller@gmail.com>
Eve <139727413+netrunnereve@users.noreply.github.com>
Evgeny Kurnevsky <kurnevsky@gmail.com>
Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
FK <sozforex@gmail.com>
Fabian <cmdrf@users.noreply.github.com>
Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
Faez Shakil <faez.shakil@gmail.com>
FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
Fattire <528174+fat-tire@users.noreply.github.com>
Felix <stenbackfelix@gmail.com>
Finn Voorhees <finnvoorhees@gmail.com>
Firat <firatkiral@gmail.com>
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
FrankHB <frankhb1989@gmail.com>
Frederik Vogel <Schaltfehler@users.noreply.github.com>
Gabe Goodhart <gabe.l.hart@gmail.com>
GainLee <perfecter.gen@gmail.com>
Galunid <karolek1231456@gmail.com>
Gary Linscott <glinscott@gmail.com>
Gary Mulder <gjmulder@gmail.com>
Genkagaku.GPT <hlhr202@163.com>
Georgi Gerganov <ggerganov@gmail.com>
Gilad S <giladgd@users.noreply.github.com>
GiviMAD <GiviMAD@users.noreply.github.com>
Govlzkoy <gotope@users.noreply.github.com>
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
Guillaume Wenzek <gwenzek@users.noreply.github.com>
Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
Haohui Mai <ricetons@gmail.com>
Haoxiang Fei <tonyfettes@tonyfettes.com>
Harald Fernengel <harald.fernengel@here.com>
Hatsune Miku <129688334+at8u@users.noreply.github.com>
Henk Poley <HenkPoley@gmail.com>
Henri Vasserman <henv@hot.ee>
Henrik Forstén <henrik.forsten@gmail.com>
Herman Semenov <GermanAizek@yandex.ru>
Hesen Peng <hesen.peng@gmail.com>
Hoang Nguyen <hugo53@users.noreply.github.com>
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
Howard Su <howard0su@gmail.com>
Hua Jiang <allenhjiang@outlook.com>
Huawei Lin <huaweilin.cs@gmail.com>
Ian Bull <irbull@eclipsesource.com>
Ian Bull <irbull@gmail.com>
Ian Scrivener <github@zilogy.asia>
Ido S <ido.pluto@gmail.com>
IgnacioFDM <ignaciofdm@gmail.com>
Igor Okulist <okigan@gmail.com>
Ikko Eltociear Ashimine <eltociear@gmail.com>
Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Ionoclast Laboratories <brigham@ionoclast.com>
Isaac McFadyen <isaac@imcf.me>
IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
Ivan Komarov <Ivan.Komarov@dfyz.info>
Ivan Stepanov <ivanstepanovftw@gmail.com>
JH23X <165871467+JH23X@users.noreply.github.com>
Jack Mousseau <jmousseau@users.noreply.github.com>
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
Jag Chadha <jagtesh@gmail.com>
Jakub N <jakubniemczyk97@gmail.com>
James Reynolds <magnusviri@users.noreply.github.com>
Jan Boon <jan.boon@kaetemi.be>
Jan Boon <kaetemi@gmail.com>
Jan Ploski <jpl@plosquare.com>
Jannis Schönleber <joennlae@gmail.com>
Jared Van Bortel <cebtenzzre@gmail.com>
Jared Van Bortel <jared@nomic.ai>
Jason McCartney <jmac@theroot.org>
Jean-Christophe Hoelt <hoelt@fovea.cc>
Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
Jed Fox <git@jedfox.com>
Jeffrey Quesnelle <emozilla@nousresearch.com>
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
Jhen-Jie Hong <iainst0409@gmail.com>
Jiahao Li <liplus17@163.com>
Jian Liao <jianliao@users.noreply.github.com>
JidongZhang-THU <1119708529@qq.com>
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
Johannes Gäßler <johannesg@5d6.de>
Johannes Rudolph <johannes.rudolph@gmail.com>
John <78893154+cmp-nct@users.noreply.github.com>
John Balis <phobossystems@gmail.com>
John Smith <67539080+kingsidelee@users.noreply.github.com>
JohnnyB <jboero@users.noreply.github.com>
Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
Jorge A <161275481+jorgealias@users.noreply.github.com>
Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
Joseph Stahl <1269177+josephst@users.noreply.github.com>
Joyce <joycebrum@google.com>
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
Judd <foldl@users.noreply.github.com>
Julius Arkenberg <arki05@users.noreply.github.com>
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
Juraj Bednar <juraj@bednar.io>
Justin Parker <jparkerweb@gmail.com>
Justin Suess <justin.suess@westpoint.edu>
Justine Tunney <jtunney@gmail.com>
Juuso Alasuutari <juuso.alasuutari@gmail.com>
KASR <karim.asrih@gmail.com>
Kamil Tomšík <info@tomsik.cz>
Karsten Weiss <knweiss@gmail.com>
Karthick <j.karthic2004@gmail.com>
Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
Karthik Sethuraman <k.seth1993@gmail.com>
Kasumi <90275229+kasumi-1@users.noreply.github.com>
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Keiichi Tabata <keiichi.tabata@outlook.com>
Kenvix ⭐ <kenvixzure@live.com>
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Kevin Ji <1146876+kevinji@users.noreply.github.com>
Kevin Kwok <antimatter15@gmail.com>
Kevin Lo <kevlo@kevlo.org>
Kolen Cheung <ickc@users.noreply.github.com>
Konstantin Herud <konstantin.herud@denkbares.com>
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
Kunshang Ji <kunshang.ji@intel.com>
Kyle Liang <liangmanlai@gmail.com>
Kyle Mistele <kyle@mistele.com>
Kylin <56434533+KyL0N@users.noreply.github.com>
Lars Grammel <lars.grammel@gmail.com>
Laura <Tijntje_7@msn.com>
Lee <44310445+lx200916@users.noreply.github.com>
Lee Drake <b.lee.drake@gmail.com>
Leng Yue <lengyue@lengyue.me>
LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
Leonardo Neumann <leonardo@neumann.dev.br>
Li Tan <tanliboy@gmail.com>
Linwei Wang <wanix1988@gmail.com>
LoganDark <github@logandark.mozmail.com>
LostRuins <39025047+LostRuins@users.noreply.github.com>
Luciano <lucianostrika44@gmail.com>
Luo Tian <lt@basecity.com>
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
Maarten ter Huurne <maarten@treewalker.org>
Mack Straight <eiz@users.noreply.github.com>
Maël Kerbiriou <m431.kerbiriou@gmail.com>
MaggotHATE <clay1326@gmail.com>
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
Marco Matthies <71844+marcom@users.noreply.github.com>
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
Marian Cepok <marian.cepok@gmail.com>
Mark Fairbairn <thebaron88@gmail.com>
Marko Tasic <mtasic85@gmail.com>
Martin Krasser <krasserm@googlemail.com>
Martin Schwaighofer <mschwaig@users.noreply.github.com>
Marvin Gießing <marvin.giessing@gmail.com>
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
Matheus C. França <matheus-catarino@hotmail.com>
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
Mathijs de Bruin <mathijs@mathijsfietst.nl>
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
Matt Pulver <matt.pulver@heavy.ai>
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
Matthew Tejo <matthew.tejo@gmail.com>
Matvey Soloviev <blackhole89@gmail.com>
Maxime <672982+maximegmd@users.noreply.github.com>
Maximilian Winter <maximilian.winter.91@gmail.com>
Meng Zhang <meng@tabbyml.com>
Meng, Hengyu <hengyu.meng@intel.com>
Merrick Christensen <merrick.christensen@gmail.com>
Michael Coppola <m18coppola@gmail.com>
Michael Hueschen <m@mhueschen.dev>
Michael Kesper <mkesper@schokokeks.org>
Michael Klimenko <mklimenko29@gmail.com>
Michael Podvitskiy <podvitskiymichael@gmail.com>
Michael Potter <NanoTekGuy@Gmail.com>
Michaël de Vries <vriesdemichael@gmail.com>
Mihai <mihai.chirculescu@yahoo.com>
Mike <ytianhui2004@gmail.com>
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
Mirko185 <mirkosig@gmail.com>
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
Murilo Santana <mvrilo@gmail.com>
Musab Gultekin <musabgultekin@users.noreply.github.com>
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
Nebula <infinitewormhole@gmail.com>
Neo Zhang Jianyu <jianyu.zhang@intel.com>
Neuman Vong <neuman.vong@gmail.com>
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Niall Coates <1349685+Niall-@users.noreply.github.com>
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
Nigel Bosch <pnigelb@gmail.com>
Niklas Korz <niklas@niklaskorz.de>
Nindaleth <Nindaleth@users.noreply.github.com>
Oleksandr Nikitin <oleksandr@tvori.info>
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
Olivier Chafik <ochafik@users.noreply.github.com>
Ondřej Čertík <ondrej@certik.us>
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
Paul Tsochantaris <ptsochantaris@icloud.com>
Pavol Rusnak <pavol@rusnak.io>
Pedro Cuenca <pedro@huggingface.co>
Peter Sugihara <peter@campsh.com>
Phil H <5756783+phiharri@users.noreply.github.com>
Philip Taron <philip.taron@gmail.com>
Phillip Kravtsov <phillip@kravtsov.net>
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
Pierrick Hymbert <pierrick.hymbert@gmail.com>
Przemysław Pawełczyk <przemoc@gmail.com>
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
Qingyou Meng <meng.qingyou@gmail.com>
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
RJ Adriaansen <adriaansen@eshcc.eur.nl>
Radoslav Gerganov <rgerganov@gmail.com>
Radosław Gryta <radek.gryta@gmail.com>
Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
Rand Xie <randxiexyy29@gmail.com>
Randall Fitzgerald <randall@dasaku.net>
Reinforce-II <fate@eastal.com>
Riceball LEE <snowyu.lee@gmail.com>
Richard Kiss <him@richardkiss.com>
Richard Roberson <richardr1126@gmail.com>
Rick G <26732651+TheFlipbook@users.noreply.github.com>
Rickard Edén <rickardeden@gmail.com>
Rickard Hallerbäck <rickard.hallerback@gmail.com>
Rickey Bowers Jr <bitRAKE@gmail.com>
Riley Stewart <ristew@users.noreply.github.com>
Rinne <AsakusaRinne@gmail.com>
Rinne <liu_yaohui1998@126.com>
Robert Brisita <986796+rbrisita@users.noreply.github.com>
Robert Sung-wook Shin <edp1096@users.noreply.github.com>
Robey Holderith <robey@flaminglunchbox.net>
Robyn <robyngraf@users.noreply.github.com>
Roger Meier <r.meier@siemens.com>
Roland <14355895+rbur0425@users.noreply.github.com>
Romain D <90720+Artefact2@users.noreply.github.com>
Romain Neutron <romain@neutron.io>
Roman Parykin <donderom@gmail.com>
Ron Evans <ron@hybridgroup.com>
Ron Jailall <rojailal@gmail.com>
Ronny Brendel <ronnybrendel@gmail.com>
Ronsor <ronsor@ronsor.pw>
Rowan Hart <rowanbhart@gmail.com>
Rune <43761327+Rune-AI@users.noreply.github.com>
Ryan Landay <rlanday@gmail.com>
Ryder Wishart <ryderwishart@gmail.com>
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
SakuraUmi <yukinon244@gmail.com>
Salvador E. Tropea <stropea@inti.gob.ar>
Sam Spilsbury <smspillaz@gmail.com>
Sami Farin <3876865+Safari77@users.noreply.github.com>
Samuel Maynard <samwmaynard@gmail.com>
Sang-Kil Park <sang.park@42dot.ai>
Seb C <47074056+Sebby37@users.noreply.github.com>
Sebastián A <sebastian.aedo29@gmail.com>
SebastianApel <13675545+SebastianApel@users.noreply.github.com>
Senemu <10880819+Senemu@users.noreply.github.com>
Sergey Alirzaev <zl29ah@gmail.com>
Sergio López <slp@sinrega.org>
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
ShadovvBeast <ShadovvBeast@gmail.com>
Shakhar Dasgupta <shakhardasgupta@gmail.com>
Shangning Xu <32517059+xushangning@users.noreply.github.com>
Shijie <821898965@qq.com>
Shintarou Okada <kokuzen@gmail.com>
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
Shouzheng Liu <lshzh.hi@gmail.com>
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Simon Willison <swillison@gmail.com>
Siwen Yu <yusiwen@gmail.com>
Sky Yan <skyan83@gmail.com>
Slaren <2141330+slaren@users.noreply.github.com>
Slava Primenko <primenko.s@gmail.com>
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
Someone <sergei.kozlukov@aalto.fi>
Someone Serge <sergei.kozlukov@aalto.fi>
Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Spencer Sutton <spencersutton@users.noreply.github.com>
Srinivas Billa <nivibilla@gmail.com>
Stefan Sydow <stefan@sydow.email>
Stephan Walter <stephan@walter.name>
Stephen Nichols <snichols@users.noreply.github.com>
Steve Grubb <ausearch.1@gmail.com>
Steven Roussey <sroussey@gmail.com>
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
SuperUserNameMan <yoann@terminajones.com>
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
Taikono-Himazin <kazu@po.harenet.ne.jp>
Tameem <113388789+AhmadTameem@users.noreply.github.com>
Tamotsu Takahashi <ttakah+github@gmail.com>
Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
Thatcher Chamberlin <j.thatcher.c@gmail.com>
Theia Vogel <theia@vgel.me>
Thérence <13496987+Royalphax@users.noreply.github.com>
Thibault Terrasson <thibault.terrasson@gmail.com>
Thomas Klausner <wiz@gatalith.at>
Tim Miller <drasticactions@users.noreply.github.com>
Timmy Knight <r2d2fish@gmail.com>
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
Ting Lou <ting.lou@gmail.com>
Ting Sun <suntcrick@gmail.com>
Tobias Lütke <tobi@shopify.com>
Tom C <tom.corelis@gmail.com>
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Tomas <tom.tomas.36478119@gmail.com>
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
Tristan Ross <rosscomputerguy@protonmail.com>
Tungsten842 <886724vf@anonaddy.me>
Tungsten842 <quantmint@protonmail.com>
Tushar <ditsuke@protonmail.com>
UEXTM.com <84163508+uextm@users.noreply.github.com>
Uzo Nweke <uzoechi@gmail.com>
Vaibhav Srivastav <vaibhavs10@gmail.com>
Val Kharitonov <mail@kharvd.com>
Valentin Konovalov <valle.ketsujin@gmail.com>
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
Victor Z. Peng <ziliangdotme@gmail.com>
Vlad <spitfireage@gmail.com>
Vladimir <bogdad@gmail.com>
Vladimir Malyutin <first-leon@yandex.ru>
Vladimir Zorin <vladimir@deviant.guru>
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
Weird Constructor <weirdconstructor@gmail.com>
Welby Seely <welbyseely@gmail.com>
Wentai Zhang <rchardx@gmail.com>
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
Willy Tarreau <w@1wt.eu>
Wu Jian Ping <wujjpp@hotmail.com>
Wu Jian Ping <wujp@greatld.com>
Xiake Sun <xiake.sun@intel.com>
Xiang (Kevin) Li <kevinli020508@gmail.com>
Xiao-Yong Jin <jinxiaoyong@gmail.com>
XiaotaoChen <chenxiaotao1234@gmail.com>
Xiaoyi Chen <cxychina@gmail.com>
Xingchen Song(宋星辰) <xingchensong1996@163.com>
Xuan Son Nguyen <thichthat@gmail.com>
Yann Follet <131855179+YannFollet@users.noreply.github.com>
Yiming Cui <conandiy@vip.qq.com>
Yishuo Wang <MeouSker77@outlook.com>
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
Yui <dev@sleepyyui.com>
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
ZHAOKAI WANG <sanxianwei@163.com>
Zane Shannon <z@zcs.me>
Zay <95888118+isaiahbjork@users.noreply.github.com>
Zenix <zenixls2@gmail.com>
Zhang Peiyuan <a1286225768@gmail.com>
ZhouYuChen <zhouyuchen@naver.com>
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
Zsapi <martin1.zsapka@gmail.com>
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
adel boussaken <netdur@gmail.com>
afrideva <95653597+afrideva@users.noreply.github.com>
akawrykow <142945436+akawrykow@users.noreply.github.com>
alexpinel <93524949+alexpinel@users.noreply.github.com>
alonfaraj <alonfaraj@gmail.com>
andrijdavid <david@geek.mg>
anon998 <131767832+anon998@users.noreply.github.com>
anzz1 <anzz1@live.com>
apaz <aarpazdera@gmail.com>
apcameron <37645737+apcameron@users.noreply.github.com>
arcrank <arcrank@gmail.com>
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
at8u <129688334+at8u@users.noreply.github.com>
automaticcat <daogiatuank54@gmail.com>
bandoti <141645996+bandoti@users.noreply.github.com>
beiller <beiller@gmail.com>
bhubbb <79117352+bhubbb@users.noreply.github.com>
bmwl <brian.marshall@tolko.com>
bobqianic <129547291+bobqianic@users.noreply.github.com>
bryanSwk <93190252+bryanSwk@users.noreply.github.com>
bsilvereagle <bsilvereagle@users.noreply.github.com>
bssrdf <merlintiger@hotmail.com>
byte-6174 <88070277+byte-6174@users.noreply.github.com>
cebtenzzre <cebtenzzre@gmail.com>
chaihahaha <chai836275709@gmail.com>
chiranko <96988916+chiranko@users.noreply.github.com>
clibdev <52199778+clibdev@users.noreply.github.com>
clyang <clyang@clyang.net>
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
coezbek <c.oezbek@gmail.com>
comex <comexk@gmail.com>
compilade <113953597+compilade@users.noreply.github.com>
crasm <crasm@git.vczf.net>
crasm <crasm@git.vczf.us>
daboe01 <daboe01@googlemail.com>
david raistrick <keen99@users.noreply.github.com>
ddpasa <112642920+ddpasa@users.noreply.github.com>
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
divinity76 <divinity76@gmail.com>
dotpy314 <33351922+dotpy314@users.noreply.github.com>
drbh <david.richard.holtz@gmail.com>
ds5t5 <145942675+ds5t5@users.noreply.github.com>
dylan <canardleteer@users.noreply.github.com>
eastriver <lee@eastriver.dev>
ebraminio <ebraminio@gmail.com>
eiery <19350831+eiery@users.noreply.github.com>
eric8607242 <e0928021388@gmail.com>
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
gliptic <gliptic@users.noreply.github.com>
goerch <jhr.walter@t-online.de>
grahameth <96447521+grahameth@users.noreply.github.com>
gwjr <502526+gwjr@users.noreply.github.com>
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
hankcs <cnhankmc@gmail.com>
hoangmit <hoangmit@users.noreply.github.com>
hongbo.mo <352280764@qq.com>
howlger <eclipse@voormann.de>
howlger <github@voormann.de>
hutli <6594598+hutli@users.noreply.github.com>
hutli <hutli@hutli.hu>
hutli <jensstaermose@hotmail.com>
hxer7963 <hxer7963@gmail.com>
hydai <z54981220@gmail.com>
iSma <ismail.senhaji@gmail.com>
iacore <74560659+iacore@users.noreply.github.com>
igarnier <igarnier@protonmail.com>
iohub <rickyang.pro@gmail.com>
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
jameswu2014 <545426914@qq.com>
jneem <joeneeman@gmail.com>
johnson442 <56517414+johnson442@users.noreply.github.com>
jon-chuang <9093549+jon-chuang@users.noreply.github.com>
jp-x-g <jpxg-dev@protonmail.com>
jwj7140 <32943891+jwj7140@users.noreply.github.com>
kaizau <kaizau@users.noreply.github.com>
kalomaze <66376113+kalomaze@users.noreply.github.com>
kang <tpdns9032100@gmail.com>
katsu560 <118887472+katsu560@users.noreply.github.com>
kchro3 <62481661+kchro3@users.noreply.github.com>
khimaros <me@khimaros.com>
kiltyj <kiltyj@gmail.com>
klosax <131523366+klosax@users.noreply.github.com>
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
kunnis <kunnis@users.noreply.github.com>
kuronekosaiko <EvanChanJ@163.com>
kuvaus <22169537+kuvaus@users.noreply.github.com>
kwin1412 <42286931+kwin1412@users.noreply.github.com>
l3utterfly <gc.pthzfoldr@gmail.com>
ldwang <ftgreat@163.com>
le.chang <cljs118@126.com>
leejet <leejet714@gmail.com>
limitedAtonement <limitedAtonement@users.noreply.github.com>
lon <114724657+longregen@users.noreply.github.com>
m3ndax <adrian.goessl@outlook.com>
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
makomk <makosoft@googlemail.com>
manikbhandari <mbbhandarimanik2@gmail.com>
mdrokz <mohammadmunshi@gmail.com>
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
minarchist <minarchist@users.noreply.github.com>
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
mmyjona <jonathan.gonse@gmail.com>
momonga <115213907+mmnga@users.noreply.github.com>
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
mzcu <milos.cubrilo@gmail.com>
nanahi <130121847+na-na-hi@users.noreply.github.com>
ngc92 <7938269+ngc92@users.noreply.github.com>
nhamanasu <45545786+nhamanasu@users.noreply.github.com>
niansa/tuxifan <anton-sa@web.de>
niansa/tuxifan <tuxifan@posteo.de>
ningshanwutuobang <ningshanwutuobang@gmail.com>
nold <Nold360@users.noreply.github.com>
nopperl <54780682+nopperl@users.noreply.github.com>
nusu-github <29514220+nusu-github@users.noreply.github.com>
olexiyb <olexiyb@gmail.com>
oobabooga <112222186+oobabooga@users.noreply.github.com>
opparco <parco.opaai@gmail.com>
ostix360 <55257054+ostix360@users.noreply.github.com>
perserk <perserk@gmail.com>
postmasters <namnguyen@google.com>
pudepiedj <pudepiedj@gmail.com>
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
qouoq <qouoq@fastmail.com>
qunash <anzoria@gmail.com>
rabidcopy <rabidcopy@yahoo.com>
rankaiyx <rankaiyx@rankaiyx.com>
rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
rhuddleston <ryan.huddleston@percona.com>
rimoliga <53384203+rimoliga@users.noreply.github.com>
runfuture <runfuture@users.noreply.github.com>
sandyiscool <sandyiscool@gmail.com>
semidark <me@semidark.net>
sharpHL <132747147+sharpHL@users.noreply.github.com>
shibe2 <shibe@tuta.io>
singularity <12184989+singularity-s0@users.noreply.github.com>
sjinzh <sjinzh@gmail.com>
slaren <2141330+slaren@users.noreply.github.com>
slaren <slarengh@gmail.com>
snadampal <87143774+snadampal@users.noreply.github.com>
staviq <staviq@gmail.com>
stduhpf <stephduh@live.fr>
swittk <switt1995@gmail.com>
takov751 <40316768+takov751@users.noreply.github.com>
tarcey <cey.tarik@gmail.com>
texmex76 <40733439+texmex76@users.noreply.github.com>
thement <40525767+thement@users.noreply.github.com>
tjohnman <tjohnman@users.noreply.github.com>
tslmy <tslmy@users.noreply.github.com>
ubik2 <ubik2@users.noreply.github.com>
uint256_t <konndennsa@gmail.com>
uint256_t <maekawatoshiki1017@gmail.com>
unbounded <haakon@likedan.net>
valiray <133289098+valiray@users.noreply.github.com>
vodkaslime <646329483@qq.com>
vvhg1 <94630311+vvhg1@users.noreply.github.com>
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
whoreson <139810751+whoreson@users.noreply.github.com>
wonjun Jang <strutive07@gmail.com>
wzy <32936898+Freed-Wu@users.noreply.github.com>
xaedes <xaedes@gmail.com>
xaedes <xaedes@googlemail.com>
xloem <0xloem@gmail.com>
yangli2 <yangli2@gmail.com>
yuiseki <yuiseki@gmail.com>
zakkor <edward.partenie@gmail.com>
zhouwg <6889919+zhouwg@users.noreply.github.com>
zrm <trustiosity.zrm@gmail.com>
源文雨 <41315874+fumiama@users.noreply.github.com>
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
LICENSE (2 lines changed)
@@ -1,6 +1,6 @@
MIT License

-Copyright (c) 2023 Georgi Gerganov
+Copyright (c) 2023-2024 The ggml authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
README.md (13 lines changed)
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Recent API changes

- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
@@ -120,6 +121,9 @@ Typically finetunes of the base models below are supported as well.
- [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)

(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

**Multimodal models:**

@@ -183,7 +187,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0 or later)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)

*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

---
@@ -494,7 +499,7 @@ Building the program with BLAS support may lead to some performance improvements

This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
-You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).

- Using `make`:
  ```bash
@@ -511,7 +516,7 @@ Building the program with BLAS support may lead to some performance improvements

- Using `make` (example for target gfx1030, build with 16 CPU threads):
  ```bash
-  make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
+  make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
  ```

- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
@@ -519,7 +524,7 @@ Building the program with BLAS support may lead to some performance improvements
  set PATH=%HIP_PATH%\bin;%PATH%
  mkdir build
  cd build
-  cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
+  cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
  cmake --build .
  ```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
|
@ -16,6 +16,7 @@
|
|||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include <cinttypes>
|
||||
#include <codecvt>
|
||||
|
||||
#if defined(__APPLE__) && defined(__MACH__)
|
||||
#include <sys/types.h>
|
||||
|
@ -27,7 +28,6 @@
|
|||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <codecvt>
|
||||
#include <locale>
|
||||
#include <windows.h>
|
||||
#include <fcntl.h>
|
||||
|
@@ -1500,6 +1500,77 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    GGML_UNREACHABLE();
}

// Validate if a filename is safe to use
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
bool validate_file_name(const std::string & filename) {
    if (!filename.length()) {
        // Empty filename invalid
        return false;
    }
    if (filename.length() > 255) {
        // Limit at common largest possible filename on Linux filesystems
        // to avoid unnecessary further validation
        // (On systems with smaller limits it will be caught by the OS)
        return false;
    }

    std::u32string filename_utf32;
    try {
        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
        filename_utf32 = converter.from_bytes(filename);

        // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
        // or invalid encodings were encountered. Reject such attempts
        std::string filename_reencoded = converter.to_bytes(filename_utf32);
        if (filename_reencoded != filename) {
            return false;
        }
    } catch (const std::exception &) {
        return false;
    }

    // Check for forbidden codepoints:
    // - Control characters
    // - Unicode equivalents of illegal characters
    // - UTF-16 surrogate pairs
    // - UTF-8 replacement character
    // - Byte order mark (BOM)
    // - Illegal characters: / \ : * ? " < > |
    for (char32_t c : filename_utf32) {
        if (c <= 0x1F // Control characters (C0)
            || c == 0x7F // Control characters (DEL)
            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
            || c == 0x2215 // Division Slash (forward slash equivalent)
            || c == 0x2216 // Set Minus (backslash equivalent)
            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
            || c == 0xFFFD // Replacement Character (UTF-8)
            || c == 0xFEFF // Byte Order Mark (BOM)
            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
            return false;
        }
    }

    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
    // Unicode and other whitespace is not affected, only 0x20 space
    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
        return false;
    }

    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // Reject "."
    if (filename == ".") {
        return false;
    }

    return true;
}

//
// String utils
//
@@ -2141,23 +2212,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
+                        bool   add_special,
+                        bool   parse_special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
}

std::vector<llama_token> llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special) {
+                        bool   add_special,
+                        bool   parse_special) {
    // upper limit for the number of tokens
-    int n_tokens = text.length() + add_bos;
+    int n_tokens = text.length() + 2 * add_special;
    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
common/common.h
@@ -179,6 +179,8 @@ std::string gpt_random_prompt(std::mt19937 & rng);

void process_escapes(std::string& input);

+bool validate_file_name(const std::string & filename);

//
// String utils
//
@@ -221,14 +223,14 @@ void llama_batch_add(
std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
+                        bool   add_special,
+                        bool   parse_special = false);

std::vector<llama_token> llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
+                        bool   add_special,
+                        bool   parse_special = false);

// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`
common/sampling.h
@@ -129,7 +129,7 @@ llama_token llama_sampling_sample(
    struct llama_sampling_context * ctx_sampling,
    struct llama_context * ctx_main,
    struct llama_context * ctx_cfg,
-    int idx = 0);
+    int idx = -1);

// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
llama_token_data_array llama_sampling_prepare(
convert-hf-to-gguf.py
@@ -160,7 +160,7 @@ class Model(ABC):
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+        if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
@@ -227,15 +227,14 @@ class Model(ABC):
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

-    def _set_vocab_gpt2(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
+    # used for GPT-2 BPE and WordPiece vocabs
+    def get_basic_vocab(self) -> tuple[list[str], list[int]]:
        tokens: list[str] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
@@ -255,11 +254,15 @@ class Model(ABC):
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

+        return tokens, toktypes
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes = self.get_basic_vocab()
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_qwen(self):
@@ -2202,34 +2205,25 @@ class BertModel(Model):
        self.gguf_writer.add_pooling_type(pooling_type)

    def set_vocab(self):
-        # use huggingface vocab to get all tokens
-        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
-        tokens, scores, toktypes = zip(*vocab.all_tokens())
-        assert len(tokens) == vocab.vocab_size
-        self.vocab_size = vocab.vocab_size
+        tokens, toktypes = self.get_basic_vocab()
+        self.vocab_size = len(tokens)

        # we need this to validate the size of the token_type embeddings
        # though currently we are passing all zeros to the token_type embeddings
-        n_token_types = len(set(toktypes))
-        self.gguf_writer.add_token_type_count(n_token_types)
+        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"

        # convert to phantom space vocab
-        def phantom(tok, typ):
-            if tok.startswith(b"[") and tok.endswith(b"]"):
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
                return tok
-            if tok.startswith(b"##"):
+            if tok.startswith("##"):
                return tok[2:]
-            return b"\xe2\x96\x81" + tok
-        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
-        # set up bos and eos tokens (cls and sep)
-        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
-        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+            return "\u2581" + tok
+        tokens = list(map(phantom, tokens))

        # add vocab to gguf
        self.gguf_writer.add_tokenizer_model("bert")
        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        # handle special tokens
@@ -2301,16 +2295,6 @@ class NomicBertModel(BertModel):
        super().set_gguf_parameters()
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])

-    def get_tensors(self):
-        assert self.vocab_size is not None
-        for name, data in super().get_tensors():
-            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
-            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
-                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
-                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
-                data = data[:self.vocab_size, :]
-            yield name, data
-

@Model.register("GemmaForCausalLM")
class GemmaModel(Model):
@@ -2486,7 +2470,8 @@ class MambaModel(Model):
            data = data.astype(np.float32)

        # if f16 desired, convert big float32 2-dim weight tensors to float16
-        if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+        new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
+        if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
            data = data.astype(np.float16)

        print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
@@ -1,4 +1,6 @@
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import os
import sys
convert.py (26 lines changed)
@@ -33,7 +33,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    import gguf

if TYPE_CHECKING:
-    from typing import TypeAlias
+    from typing_extensions import Self, TypeAlias

if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
    faulthandler.register(signal.SIGUSR1)
@@ -139,7 +139,8 @@ class GGMLFileType(enum.IntEnum):
        dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
        if dt is None:
            raise ValueError(self)
-        # 1D tensors are always F32.
+        # Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
+        # Also The 1d tensors aren't much of a performance/size issue. So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
        return dt if len(tensor.shape) > 1 else DT_F32

@@ -516,7 +517,7 @@ class LlamaHfVocab(Vocab):
    tokenizer_model = "llama"
    name = "hfft"

-    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
+    def __init__(self, base_path: Path):
        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
        # if this fails, FileNotFoundError propagates to caller
        with open(fname_tokenizer, encoding='utf-8') as f:
@@ -524,9 +525,7 @@ class LlamaHfVocab(Vocab):

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if ignore_nonllama:
-            pass  # workaround incorrect use of this class for WordPiece
-        elif (
+        if (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
@@ -646,16 +645,17 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:


class Tensor(ABC):
    ndarray: NDArray
    data_type: DataType

    @abstractmethod
-    def astype(self, data_type: DataType) -> Tensor: ...
+    def astype(self, data_type: DataType) -> Self: ...
    @abstractmethod
-    def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
+    def permute(self, n_head: int, n_head_kv: int) -> Self: ...
    @abstractmethod
-    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
    @abstractmethod
-    def part(self, n_part: int) -> UnquantizedTensor: ...
+    def part(self, n_part: int) -> Self: ...
    @abstractmethod
    def to_ggml(self) -> GGMLCompatibleTensor: ...

@@ -672,13 +672,13 @@ class UnquantizedTensor(Tensor):
        self.ndarray = ndarray
        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]

-    def astype(self, data_type: DataType) -> Tensor:
+    def astype(self, data_type: DataType) -> UnquantizedTensor:
        dtype = data_type.dtype
        if self.data_type == DT_BF16:
            self.ndarray = bf16_to_fp32(self.ndarray)
        return UnquantizedTensor(self.ndarray.astype(dtype))

-    def to_ggml(self) -> UnquantizedTensor:
+    def to_ggml(self) -> Self:
        return self

    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
@@ -1350,7 +1350,7 @@ def load_some_model(path: Path) -> ModelPlus:
    # Be extra-friendly and accept either a file or a directory:
    if path.is_dir():
        # Check if it's a set of safetensors files first
-        globs = ["model-00001-of-*.safetensors", "model.safetensors"]
+        globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
        files = [file for glob in globs for file in path.glob(glob)]
        if not files:
            # Try the PyTorch patterns too, with lower priority
docs/HOWTO-add-model.md (new file, 117 lines)
@@ -0,0 +1,117 @@
## Add a new model architecture to `llama.cpp`

Adding a model requires a few steps:

1. Convert the model to GGUF
2. Define the model architecture in `llama.cpp`
3. Build the GGML graph implementation

After following these steps, you can open a PR.

Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](../examples/main)
- [imatrix](../examples/imatrix)
- [quantize](../examples/quantize)
- [server](../examples/server)

### 1. Convert the model to GGUF

This step is done in Python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).

The convert script reads the model configuration, tokenizer, tensor names and data, and converts them to GGUF metadata and tensors.
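For orientation, here is a minimal sketch of what the `gguf` library does underneath the convert scripts, along the lines of `gguf-py/examples/writer.py`; the architecture string, metadata value and tensor contents below are placeholders, not a real model:

```python
import numpy as np
from gguf import GGUFWriter

# open a writer for the target architecture
gguf_writer = GGUFWriter("example.gguf", "llama")

# model hyperparameters become GGUF key-value metadata
gguf_writer.add_block_count(12)

# tensor data is registered under its GGUF tensor name
gguf_writer.add_tensor("tensor1", np.ones((32,), dtype=np.float32))

# the file is laid out as header, then KV data, then tensor data
gguf_writer.write_header_to_file()
gguf_writer.write_kv_data_to_file()
gguf_writer.write_tensors_to_file()
gguf_writer.close()
```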
The required steps to implement for an HF model are:

1. Define the model `Model.register` annotation in a new `Model` subclass, for example:

```python
@Model.register("MyModelForCausalLM")
class MyModel(Model):
    model_arch = gguf.MODEL_ARCH.GROK
```

2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)

Add an enum entry in `MODEL_ARCH`, the model human-friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.

Example for the `falcon` model:

```python
MODEL_ARCH.FALCON: [
    MODEL_TENSOR.TOKEN_EMBD,
    MODEL_TENSOR.OUTPUT_NORM,
    MODEL_TENSOR.OUTPUT,
    MODEL_TENSOR.ATTN_NORM,
    MODEL_TENSOR.ATTN_NORM_2,
    MODEL_TENSOR.ATTN_QKV,
    MODEL_TENSOR.ATTN_OUT,
    MODEL_TENSOR.FFN_DOWN,
    MODEL_TENSOR.FFN_UP,
]
```

3. Map the original tensor names to the standardized equivalents in GGUF

As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.

Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.

If the tensor name is part of a repetitive layer or block, the placeholder `bid` substitutes the block index.

Example for the normalization tensor in attention layers:

```python
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
    # Attention norm
    MODEL_TENSOR.ATTN_NORM: (
        "gpt_neox.layers.{bid}.input_layernorm",  # gptneox
        "transformer.h.{bid}.ln_1",               # gpt2 gpt-j refact qwen
        "transformer.blocks.{bid}.norm_1",        # mpt
        ...
    ),
}
```

`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
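You can sanity-check such a mapping from Python; this is a small illustrative snippet (assuming the `gguf-py` package from this repository is importable), not part of the conversion flow itself:

```python
import gguf

# build the tensor name map for an architecture with a given block count
tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MPT, 32)

# resolve an original checkpoint tensor name to its GGUF equivalent;
# try_suffixes re-appends the matched suffix to the mapped name
name = tensor_map.get_name("transformer.blocks.7.norm_1.weight", try_suffixes=(".weight", ".bias"))
print(name)  # blk.7.attn_norm.weight
```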
Depending on the model configuration, tokenizer, code and tensor layout, you will have to override one or more of the following (a minimal sketch follows this list):
- `Model#set_gguf_parameters`
- `Model#set_vocab`
- `Model#write_tensors`
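As a minimal sketch of such overrides for a hypothetical model (the `config.json` key names below are assumptions that vary per model):

```python
@Model.register("MyModelForCausalLM")
class MyModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA  # placeholder architecture

    def set_gguf_parameters(self):
        # read hyperparameters from the HF config and write the GGUF keys
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        # many models can simply reuse one of the base class helpers
        self._set_vocab_gpt2()
```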
NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` rely on it to identify the weights.

### 2. Define the model architecture in `llama.cpp`

The model params and tensors layout must be defined in `llama.cpp`:
1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non-standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`

NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.

### 3. Build the GGML graph implementation

This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.

Have a look at existing implementations such as `build_llama`, `build_dbrx` or `build_bert`.

When implementing a new graph, please note that the underlying `ggml` backends might not support all the required operations; support for missing backend operations can be added in another PR.

## GGUF specification

https://github.com/ggerganov/ggml/blob/master/docs/gguf.md

## Resources

- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948

@@ -123,10 +123,10 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    // add eos if not present
+    // add SEP if not present
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_token_eos(model)) {
-            inp.push_back(llama_token_eos(model));
+        if (inp.empty() || inp.back() != llama_token_sep(model)) {
+            inp.push_back(llama_token_sep(model));
         }
     }

@@ -349,12 +349,13 @@ static void process_logits(
 static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
 
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     const int n_ctx = llama_n_ctx(ctx);
 
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

@@ -239,6 +239,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s\n", get_system_info(params).c_str());
     }
     const bool add_bos = llama_should_add_bos_token(model);
+    GGML_ASSERT(llama_add_eos_token(model) != 1);
     LOG("add_bos: %d\n", add_bos);
 
     bool suff_rm_leading_spc = params.escape;

@@ -279,10 +280,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
 
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
 
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
 
         original_prompt_len = original_inp.size();

@@ -146,7 +146,6 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     int n_past = 0;
 
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
     std::string system_prompt, user_prompt;
     size_t image_pos = prompt.find("<image>");

@@ -180,7 +179,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         }
     }
 
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
     eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
 

@@ -64,13 +64,10 @@ int main(int argc, char ** argv) {
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
     // Tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
 
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
     all = inp;
 
     const int max_context_size = llama_n_ctx(ctx);

@@ -28,10 +28,8 @@ int main(int argc, char ** argv){
     GGML_ASSERT(model != nullptr);
 
     // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);
 
 

@@ -34,11 +34,8 @@ int main(int argc, char ** argv){
     GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
 
     // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
 
     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;

@@ -42,11 +42,8 @@ int main(int argc, char ** argv){
     GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
 
    // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
 
     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;

@@ -235,7 +235,7 @@ int main(int argc, char ** argv) {
             // The file exists and is not empty
             session_tokens.resize(n_ctx);
             size_t n_token_count_out = 0;
-            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+            if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                 LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                 return 1;
             }

@@ -246,6 +246,7 @@ int main(int argc, char ** argv) {
     }
 
     const bool add_bos = llama_should_add_bos_token(model);
+    GGML_ASSERT(llama_add_eos_token(model) != 1);
     LOG("add_bos: %d\n", add_bos);
 
     std::vector<llama_token> embd_inp;

@@ -255,7 +256,7 @@ int main(int argc, char ** argv) {
         if (params.chatml) {
             params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
         }
-        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
     } else {
         LOG("use session tokens\n");
         embd_inp = session_tokens;

@@ -277,10 +278,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
 
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
 
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
 
         original_prompt_len = original_inp.size();

@@ -339,14 +340,14 @@ int main(int argc, char ** argv) {
     }
 
     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
 
     LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
     LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
 
     // chatml prefix & suffix
-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
+    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
     const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
 
     LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());

@@ -693,7 +694,7 @@ int main(int argc, char ** argv) {
             // optionally save the session on first sample (for faster prompt loading next time)
             if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
                 need_to_save_session = false;
-                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+                llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
 
                 LOG("saved session to %s\n", path_session.c_str());
             }

@@ -935,7 +936,7 @@ int main(int argc, char ** argv) {
 
     if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
         LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
-        llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
 
     llama_print_timings(ctx);

@@ -315,10 +315,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     // BOS tokens will be added for each chunk before eval
 
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
 
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
     const int n_ctx = llama_n_ctx(ctx);
 

@@ -454,6 +455,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     // BOS tokens will be added for each chunk before eval
 
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
 
     std::ofstream logits_stream;
     if (!params.logits_file.empty()) {

@@ -470,7 +472,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

@@ -771,9 +773,6 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
     fprintf(stderr, "================================= is_spm = %d\n", is_spm);
 
-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
     // The tasks should be randomized so the score stabilizes quickly.
     bool randomize_tasks = true;
 

@@ -818,7 +817,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
             for (size_t j = 0; j < 4; j++) {
                 hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-                hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
+                hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
             }
 
             // determine the common prefix of the endings

@@ -837,7 +836,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
                 hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
                 hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
 
-            //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
+            //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());
 
             // Delete the selected random example from the prompt
             if (randomize_tasks) {

@@ -1110,12 +1109,9 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
     fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
 
-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
     for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
+        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
 
         task.common_prefix = 0;
         for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {

@@ -1130,8 +1126,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             task.seq_tokens[0].size() - task.common_prefix +
             task.seq_tokens[1].size() - task.common_prefix;
 
-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
+        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
+        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
     }
 
     fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);

@@ -1322,7 +1318,7 @@ struct multiple_choice_task {
     std::vector<float> log_probs;
 };
 
-static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
+static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
     if (task.question.empty() || task.mc1.answers.empty()) {
         if (log_error) {
             printf("%s: found bad task with empty question and/or answers\n", __func__);

@@ -1337,7 +1333,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
            }
            return false;
        }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
+        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
     }
     auto min_len = task.seq_tokens.front().size();
     for (auto& seq : task.seq_tokens) {

@@ -1436,9 +1432,6 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         n_task = params.multiple_choice_tasks;
     }
 
-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
     printf("%s: preparing task data", __func__);
     fflush(stdout);
     if (n_task > 500) {

@@ -1446,7 +1439,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         fflush(stdout);
         std::atomic<int> counter(0);
         std::atomic<int> n_bad(0);
-        auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
+        auto prepare = [&counter, &n_bad, &tasks, ctx] () {
             int num_tasks = tasks.size();
             int n_bad_local = 0;
             while (true) {

@@ -1457,7 +1450,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                 }
                 int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
                 for (int i = first; i < last; ++i) {
-                    if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
+                    if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
                 }
             }
         };

@@ -1479,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         int i_task = 0;
         for (auto& task : tasks) {
             ++i_task;
-            if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
+            if (!multiple_choice_prepare_one_task(ctx, task, true)) {
                 return;
             }
             if (i_task%n_dot == 0) {

@@ -1715,6 +1708,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     const int num_batches = (n_ctx + n_batch - 1)/n_batch;
     const int nv = 2*((n_vocab + 1)/2) + 4;
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
 
     std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
     std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);

@@ -24,6 +24,7 @@ int main(int argc, char ** argv) {
 
     std::string result0;
     std::string result1;
+    std::string result2;
 
     // init
     llama_model * model;

@@ -44,8 +45,8 @@ int main(int argc, char ** argv) {
 
     // save state (rng, logits, embedding and kv_cache) to file
     {
-        std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
-        const size_t written = llama_copy_state_data(ctx, state_mem.data());
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
+        const size_t written = llama_state_get_data(ctx, state_mem.data());
 
         FILE *fp_write = fopen("dump_state.bin", "wb");
         fwrite(state_mem.data(), 1, written, fp_write);

@@ -97,13 +98,13 @@ int main(int argc, char ** argv) {
 
     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
 
         FILE * fp_read = fopen("dump_state.bin", "rb");
         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);
 
-        if (read != llama_set_state_data(ctx2, state_mem.data())) {
+        if (read != llama_state_set_data(ctx2, state_mem.data())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx2);
             llama_free_model(model);

@@ -141,16 +142,104 @@ int main(int argc, char ** argv) {
         n_past += 1;
     }
 
-    printf("\n");
+    printf("\n\n");
 
     llama_free(ctx2);
-    llama_free_model(model);
 
     if (result0 != result1) {
         fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
         return 1;
     }
 
+    // make new context
+    auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+    printf("\nsingle seq run: %s", params.prompt.c_str());
+
+    // load state (rng, logits, embedding and kv_cache) from file
+    {
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
+
+        FILE * fp_read = fopen("dump_state.bin", "rb");
+        const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+        fclose(fp_read);
+
+        if (read != llama_state_set_data(ctx3, state_mem.data())) {
+            fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+
+        fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
+    }
+
+    // restore state (last tokens)
+    n_past = n_past_saved;
+
+    // save seq 0 and load into seq 1
+    {
+        // save kv of seq 0
+        std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
+        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
+        if (ncopy != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
+
+        // erase whole kv
+        llama_kv_cache_clear(ctx3);
+        fprintf(stderr, "%s : kv cache cleared\n", __func__);
+
+        // restore kv into seq 1
+        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
+        if (nset != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
+    }
+
+    // third run with seq 1 instead of 0
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto * logits = llama_get_logits(ctx3);
+        auto n_vocab = llama_n_vocab(model);
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx3, &candidates_p);
+        auto next_token_str = llama_token_to_piece(ctx3, next_token);
+
+        printf("%s", next_token_str.c_str());
+        result2 += next_token_str;
+
+        if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        n_past += 1;
+    }
+
+    printf("\n");
+
+    llama_free(ctx3);
+    llama_free_model(model);
+
+    if (result0 != result2) {
+        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
+        return 1;
+    }
+
     fprintf(stderr, "\n%s : success\n", __func__);
 
     return 0;

@@ -57,6 +57,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
 - `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
+- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled.
 - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 - `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
 - `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`

@@ -517,6 +518,57 @@ Available metrics:
 - `llamacpp:requests_processing`: Number of requests processing.
 - `llamacpp:requests_deferred`: Number of requests deferred.
 
+- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
+
+    *Options:*
+
+    `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "filename": "slot_save_file.bin",
+    "n_saved": 1745,
+    "n_written": 14309796,
+    "timings": {
+        "save_ms": 49.865
+    }
+}
+```
+
+- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
+
+    *Options:*
+
+    `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "filename": "slot_save_file.bin",
+    "n_restored": 1745,
+    "n_read": 14309796,
+    "timings": {
+        "restore_ms": 42.937
+    }
+}
+```
+
+- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "n_erased": 1745
+}
+```
+
 ## More examples
 
 ### Change system prompt on runtime
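
The new slot endpoints above can be exercised end to end with a few HTTP calls. A minimal sketch in Python; it assumes a local server started with `--slot-save-path` and uses the hypothetical filename `slot1.bin`:

```python
# Save the prompt cache of slot 1, restore it into slot 0, then erase slot 1.
import requests

base_url = "http://localhost:8080"

print(requests.post(f"{base_url}/slots/1?action=save", json={"filename": "slot1.bin"}).json())
print(requests.post(f"{base_url}/slots/0?action=restore", json={"filename": "slot1.bin"}).json())
print(requests.post(f"{base_url}/slots/1?action=erase").json())
```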

File diff suppressed because it is too large

@@ -51,6 +51,26 @@
     margin-bottom: 0.5em;
   }
 
+  button, input, textarea, .button, a.button, select {
+    color: #666;
+    border: 1px solid #ddd;
+    border-radius: 4px;
+    line-height: 1.5em;
+    padding: 0.25em 0.25em;
+    text-decoration: none;
+    font-size: 1.1rem;
+  }
+
+  button {
+    border: 1px solid #2a8aad;
+    background: #3584e4;
+    font-weight: normal;
+    color: #fff;
+  }
+  button:disabled {
+    background: #9cbce5;
+  }
+
   #write form {
     margin: 1em 0 0 0;
     display: flex;

@@ -406,7 +426,7 @@
       throw new Error("already running");
     }
     controller.value = new AbortController();
-    for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: document.baseURI.replace(/\/+$/, '') })) {
+    for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
       const data = chunk.data;
 
       if (data.stop) {

@@ -567,7 +587,7 @@
       runCompletion();
     }
     return html`
-      <div>
+      <div class="right">
        <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
        <button onclick=${stop} disabled=${!generating.value}>Stop</button>
        <button onclick=${reset}>Reset</button>

@@ -1015,6 +1035,10 @@
   }
 
   function App(props) {
+    useEffect(() => {
+      const query = new URLSearchParams(location.search).get("q");
+      if (query) chat(query);
+    }, []);
 
     return html`
       <div class="mode-${session.value.type}">

@@ -61,7 +61,10 @@ enum server_task_type {
     SERVER_TASK_TYPE_COMPLETION,
     SERVER_TASK_TYPE_CANCEL,
     SERVER_TASK_TYPE_NEXT_RESPONSE,
-    SERVER_TASK_TYPE_METRICS
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
 };
 
 struct server_task {

@@ -128,6 +131,7 @@ struct server_params {
 
     bool slots_endpoint = true;
     bool metrics_endpoint = false;
+    std::string slot_save_path;
 };
 
 struct server_slot {

@@ -685,6 +689,7 @@ struct server_context {
         n_ctx = llama_n_ctx(ctx);
 
         add_bos_token = llama_should_add_bos_token(model);
+        GGML_ASSERT(llama_add_eos_token(model) != 1);
 
         return true;
     }

@@ -754,7 +759,7 @@ struct server_context {
         metrics.init();
     }
 
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
         // TODO: currently, we tokenize using special tokens by default
         // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
         // but it's better compared to completely ignoring ChatML and other chat templates

@@ -772,7 +777,7 @@ struct server_context {
 
                 std::vector<llama_token> p;
                 if (first) {
-                    p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                    p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
                     first = false;
                 } else {
                     p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);

@@ -789,7 +794,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;

@@ -1054,7 +1059,7 @@ struct server_context {
         system_tokens.clear();
 
         if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, true);
 
             llama_batch_clear(batch);
 

@@ -1612,6 +1617,107 @@ struct server_context {
                     }
                     queue_results.send(res);
                 } break;
+            case SERVER_TASK_TYPE_SLOT_SAVE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    const size_t token_count = slot->cache_tokens.size();
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];
+
+                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_save_ms = (t_end - t_start) / 1000.0;
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot", id_slot },
+                        { "filename", filename },
+                        { "n_saved", token_count }, // tokens saved
+                        { "n_written", nwrite }, // bytes written
+                        { "timings", {
+                            { "save_ms", t_save_ms }
+                        } }
+                    };
+                    queue_results.send(result);
+                } break;
+            case SERVER_TASK_TYPE_SLOT_RESTORE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];
+
+                    slot->cache_tokens.resize(slot->n_ctx);
+                    size_t token_count = 0;
+                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+                    if (nread == 0) {
+                        slot->cache_tokens.resize(0);
+                        send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+                    slot->cache_tokens.resize(token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_restore_ms = (t_end - t_start) / 1000.0;
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot", id_slot },
+                        { "filename", filename },
+                        { "n_restored", token_count }, // tokens restored
+                        { "n_read", nread }, // bytes read
+                        { "timings", {
+                            { "restore_ms", t_restore_ms }
+                        } }
+                    };
+                    queue_results.send(result);
+                } break;
+            case SERVER_TASK_TYPE_SLOT_ERASE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    // Erase token cache
+                    const size_t n_erased = slot->cache_tokens.size();
+                    llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
+                    slot->cache_tokens.clear();
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot", id_slot },
+                        { "n_erased", n_erased }
+                    };
+                    queue_results.send(result);
+                } break;
         }
     }
 

@@ -1809,7 +1915,7 @@ struct server_context {
                     prefix_tokens.push_back(llama_token_middle(model));
                     prompt_tokens = prefix_tokens;
                 } else {
-                    prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
+                    prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                 }
 
                 slot.n_past = 0;

@@ -2249,6 +2355,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf(" --log-disable disables logging to a file.\n");
     printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
     printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
+    printf(" --slot-save-path PATH path to save slot kv cache (default: disabled)\n");
     printf("\n");
     printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
     printf(" --override-kv KEY=TYPE:VALUE\n");

@@ -2657,6 +2764,16 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
         sparams.slots_endpoint = false;
     } else if (arg == "--metrics") {
         sparams.metrics_endpoint = true;
+    } else if (arg == "--slot-save-path") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        sparams.slot_save_path = argv[i];
+        // if doesn't end with DIRECTORY_SEPARATOR, add it
+        if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+            sparams.slot_save_path += DIRECTORY_SEPARATOR;
+        }
     } else if (arg == "--chat-template") {
         if (++i >= argc) {
             invalid_param = true;

@@ -3159,6 +3276,112 @@ int main(int argc, char ** argv) {
         res.status = 200; // HTTP OK
     };
 
+    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        json request_data = json::parse(req.body);
+        std::string filename = request_data["filename"];
+        if (!validate_file_name(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        std::string filepath = sparams.slot_save_path + filename;
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_SAVE;
+        task.data = {
+            { "id_slot", id_slot },
+            { "filename", filename },
+            { "filepath", filepath }
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        json request_data = json::parse(req.body);
+        std::string filename = request_data["filename"];
+        if (!validate_file_name(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        std::string filepath = sparams.slot_save_path + filename;
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
+        task.data = {
+            { "id_slot", id_slot },
+            { "filename", filename },
+            { "filepath", filepath }
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_erase = [&ctx_server, &res_error](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_ERASE;
+        task.data = {
+            { "id_slot", id_slot },
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+        std::string id_slot_str = req.path_params.at("id_slot");
+        int id_slot;
+
+        try {
+            id_slot = std::stoi(id_slot_str);
+        } catch (const std::exception &) {
+            res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        std::string action = req.get_param_value("action");
+
+        if (action == "save") {
+            handle_slots_save(req, res, id_slot);
+        } else if (action == "restore") {
+            handle_slots_restore(req, res, id_slot);
+        } else if (action == "erase") {
+            handle_slots_erase(req, res, id_slot);
+        } else {
+            res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
+        }
+    };
+
     const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
         json data = {

@@ -3521,6 +3744,10 @@ int main(int argc, char ** argv) {
     svr->Post("/v1/embeddings", handle_embeddings);
     svr->Post("/tokenize", handle_tokenize);
     svr->Post("/detokenize", handle_detokenize);
+    if (!sparams.slot_save_path.empty()) {
+        // only enable slot endpoints if slot_save_path is set
+        svr->Post("/slots/:id_slot", handle_slots_action);
+    }
 
     //
     // Start the server

58	examples/server/tests/features/slotsave.feature	Normal file

@@ -0,0 +1,58 @@
+@llama.cpp
+@slotsave
+Feature: llama.cpp server slot management
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And prompt caching is enabled
+    And 2 slots
+    And . as slot save path
+    And 2048 KV cache size
+    And 42 as server seed
+    And 24 max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+
+  Scenario: Save and Restore Slot
+    # First prompt in slot 1 should be fully processed
+    Given a user prompt "What is the capital of France?"
+    And using slot id 1
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Lily|cake)
+    And 22 prompt tokens are processed
+    When the slot 1 is saved with filename "slot1.bin"
+    Then the server responds with status code 200
+    # Since we have cache, this should only process the last tokens
+    Given a user prompt "What is the capital of Germany?"
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Thank|special)
+    And 7 prompt tokens are processed
+    # Loading the original cache into slot 0,
+    # we should only be processing 1 prompt token and get the same output
+    When the slot 0 is restored with filename "slot1.bin"
+    Then the server responds with status code 200
+    Given a user prompt "What is the capital of France?"
+    And using slot id 0
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Lily|cake)
+    And 1 prompt tokens are processed
+    # For verification that slot 1 was not corrupted during slot 0 load, same thing
+    Given a user prompt "What is the capital of Germany?"
+    And using slot id 1
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Thank|special)
+    And 1 prompt tokens are processed
+
+  Scenario: Erase Slot
+    Given a user prompt "What is the capital of France?"
+    And using slot id 1
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Lily|cake)
+    And 22 prompt tokens are processed
+    When the slot 1 is erased
+    Then the server responds with status code 200
+    Given a user prompt "What is the capital of France?"
+    And a completion request with no api error
+    Then 24 tokens are predicted matching (Lily|cake)
+    And 22 prompt tokens are processed

@@ -49,6 +49,9 @@ def step_server_config(context, server_fqdn, server_port):
     context.n_predict = None
     context.n_prompts = 0
     context.n_server_predict = None
+    context.slot_save_path = None
+    context.id_slot = None
+    context.cache_prompt = None
     context.n_slots = None
     context.prompt_prefix = None
     context.prompt_suffix = None

@@ -119,6 +122,21 @@ def step_server_n_predict(context, n_predict):
     context.n_server_predict = n_predict
 
 
+@step('{slot_save_path} as slot save path')
+def step_slot_save_path(context, slot_save_path):
+    context.slot_save_path = slot_save_path
+
+
+@step('using slot id {id_slot:d}')
+def step_id_slot(context, id_slot):
+    context.id_slot = id_slot
+
+
+@step('prompt caching is enabled')
+def step_enable_prompt_cache(context):
+    context.cache_prompt = True
+
+
 @step('continuous batching')
 def step_server_continuous_batching(context):
     context.server_continuous_batching = True

@@ -212,6 +230,8 @@ async def step_request_completion(context, api_error):
                                         context.base_url,
                                         debug=context.debug,
                                         n_predict=context.n_predict,
+                                        cache_prompt=context.cache_prompt,
+                                        id_slot=context.id_slot,
                                         seed=await completions_seed(context),
                                         expect_api_error=expect_api_error,
                                         user_api_key=context.user_api_key)

@@ -711,12 +731,48 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
     await asyncio.sleep(0.1)
 
 
+@step('the slot {slot_id:d} is saved with filename "{filename}"')
+@async_run_until_complete
+async def step_save_slot(context, slot_id, filename):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
+                                json={"filename": filename},
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the slot {slot_id:d} is restored with filename "{filename}"')
+@async_run_until_complete
+async def step_restore_slot(context, slot_id, filename):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
+                                json={"filename": filename},
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the slot {slot_id:d} is erased')
+@async_run_until_complete
+async def step_erase_slot(context, slot_id):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the server responds with status code {status_code:d}')
+def step_server_responds_with_status_code(context, status_code):
+    assert context.response.status == status_code
+
+
 async def request_completion(prompt,
                              base_url,
                              debug=False,
                              prompt_prefix=None,
                              prompt_suffix=None,
                              n_predict=None,
+                             cache_prompt=False,
+                             id_slot=None,
                              seed=None,
                              expect_api_error=None,
                              user_api_key=None):

@@ -738,6 +794,8 @@ async def request_completion(prompt,
             "prompt": prompt,
             "input_suffix": prompt_suffix,
             "n_predict": n_predict if n_predict is not None else -1,
+            "cache_prompt": cache_prompt,
+            "id_slot": id_slot,
             "seed": seed if seed is not None else 42
         },
         headers=headers,

@@ -1104,6 +1162,8 @@ def start_server_background(context):
         server_args.extend(['--parallel', context.n_slots])
     if context.n_server_predict:
         server_args.extend(['--n-predict', context.n_server_predict])
+    if context.slot_save_path:
+        server_args.extend(['--slot-save-path', context.slot_save_path])
     if context.server_api_key:
         server_args.extend(['--api-key', context.server_api_key])
     if context.n_ga:

@@ -76,6 +76,28 @@ int main(int argc, char ** argv) {
         params.n_threads_batch = params.n_threads_batch_draft;
         std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
 
+        const bool vocab_type_tgt = llama_vocab_type(model_tgt);
+        LOG("vocab_type tgt: %d\n", vocab_type_tgt);
+
+        const bool vocab_type_dft = llama_vocab_type(model_dft);
+        LOG("vocab_type dft: %d\n", vocab_type_dft);
+
+        if (vocab_type_tgt != vocab_type_dft) {
+            fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
+            fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+            return 1;
+        }
+
+        if (
+            llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
+            llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
+            llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
+            llama_token_eos(model_tgt) != llama_token_eos(model_dft)
+        ) {
+            fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
+            return 1;
+        }
+
         {
             const int n_vocab_tgt = llama_n_vocab(model_tgt);
             const int n_vocab_dft = llama_n_vocab(model_dft);

@@ -105,20 +127,8 @@ int main(int argc, char ** argv) {
 
 
     // Tokenize the prompt
-    const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
-    LOG("add_bos tgt: %d\n", add_bos_tgt);
-
-    const bool add_bos_dft = llama_should_add_bos_token(model_dft);
-    LOG("add_bos dft: %d\n", add_bos_dft);
-
-    if (add_bos_tgt != add_bos_dft) {
-        fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
-        fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
-        return 1;
-    }
-
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
 
     const int max_context_size = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;

@@ -26,11 +26,9 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_default_params();
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
-    const bool add_bos = llama_should_add_bos_token(model);
-
     std::vector<llama_token> tokens;
 
-    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+    tokens = ::llama_tokenize(model, prompt, true, true);
 
     for (int i = 0; i < (int) tokens.size(); i++) {
         if (printing_ids) {

18	flake.lock	generated

@@ -5,11 +5,11 @@
         "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1709336216,
-        "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
+        "lastModified": 1712014858,
+        "narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
         "owner": "hercules-ci",
         "repo": "flake-parts",
-        "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
+        "rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
         "type": "github"
       },
       "original": {

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1711703276,
-        "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
+        "lastModified": 1712163089,
+        "narHash": "sha256-Um+8kTIrC19vD4/lUCN9/cU9kcOsD1O1m+axJqQPyMM=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
+        "rev": "fd281bd6b7d3e32ddfa399853946f782553163b5",
         "type": "github"
       },
       "original": {

@@ -37,11 +37,11 @@
     "nixpkgs-lib": {
       "locked": {
         "dir": "lib",
-        "lastModified": 1709237383,
-        "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
+        "lastModified": 1711703276,
+        "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
+        "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
         "type": "github"
       },
       "original": {

@@ -1225,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
         // the main device has a larger memory buffer to hold the results from all GPUs
         // ldc == nrows of the matrix that cuBLAS writes into
-        int ldc = id == ctx.device ? ne0 : row_diff;
+        int64_t ldc = id == ctx.device ? ne0 : row_diff;
 
         const int compute_capability = ggml_cuda_info().devices[id].cc;
 

@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    const int64_t nb2 = dst->nb[2];
+    const int64_t nb3 = dst->nb[3];
 
     GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));

@@ -394,7 +394,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 
-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
 
 
 //////////////////////
@ -4,14 +4,14 @@
|
|||
#define CUDA_Q8_0_NE_ALIGN 2048
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
||||
const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
||||
const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int ib = i/qk; // block index
|
||||
const int64_t ib = i/qk; // block index
|
||||
const int iqs = (i%qk)/qr; // quant index
|
||||
const int iybs = i - i%qk; // y block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
@ -25,7 +25,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
|
|||
}
|
||||
|
||||
template <bool need_check>
|
||||
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
|
||||
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
|
||||
#if __CUDA_ARCH__ >= CC_PASCAL
|
||||
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
||||
|
||||
|
@@ -68,13 +68,13 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
 template<typename dst_t>
 static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;

     // assume 32 threads
     const int tid = threadIdx.x;
     const int il = tid/8;
     const int ir = tid%8;
-    const int ib = 8*i + ir;
+    const int64_t ib = 8*i + ir;
     if (ib >= nb32) {
         return;
     }

@@ -96,13 +96,13 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;

     // assume 32 threads
     const int tid = threadIdx.x;
     const int il = tid/8;
     const int ir = tid%8;
-    const int ib = 8*i + ir;
+    const int64_t ib = 8*i + ir;
     if (ib >= nb32) {
         return;
     }
@@ -313,14 +313,14 @@ template<typename dst_t>
 static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
 #if QK_K == 256

     // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int ip = tid/32;       // ip is 0 or 1
-    const int il = tid - 32*ip;  // 0...32
-    const int is = 8*ip + il/16;
+    const int64_t tid = threadIdx.x;
+    const int64_t ip = tid/32;       // ip is 0 or 1
+    const int64_t il = tid - 32*ip;  // 0...32
+    const int64_t is = 8*ip + il/16;

     dst_t * y = yy + i*QK_K + 128*ip + il;

@@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 #else

     // assume 32 threads
-    const int tid = threadIdx.x;
-    const int ip = tid/16;       // 0 or 1
-    const int il = tid - 16*ip;  // 0...15
+    const int64_t tid = threadIdx.x;
+    const int64_t ip = tid/16;       // 0 or 1
+    const int64_t il = tid - 16*ip;  // 0...15

     dst_t * y = yy + i*QK_K + 16*ip + il;
@@ -571,12 +571,12 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
 #endif

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
     const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
     dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
     if (k % CUDA_Q8_0_NE_ALIGN == 0) {
         const bool need_check = false;

@@ -588,7 +588,7 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
 }

 template<typename dst_t>
-static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -598,7 +598,7 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -608,27 +608,27 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb32 = k / 32;
     const int nb = (k + 255) / 256;
     dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
 }

 template<typename dst_t>
-static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb32 = k / 32;
     const int nb = (k + 255) / 256;
     dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
 }

 template<typename dst_t>
-static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -638,7 +638,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -648,55 +648,55 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = (k + QK_K - 1) / QK_K;
     dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = (k + QK_K - 1) / QK_K;
 #if QK_K == 64
     dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
@@ -706,8 +706,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
 }

 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

     if (i >= k) {
         return;

@@ -719,7 +719,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
 }

 template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
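Note where the (int64_t) cast sits in convert_unary: casting the first operand widens the whole expression before the multiply happens. Casting the result instead would be too late. A small plain-C++ demonstration, with unsigned 32-bit stand-ins for blockDim/blockIdx:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t block_dim = 256, block_idx = 16777216, thread_idx = 0;
        // 32-bit multiply wraps first, then gets widened: wrong
        const int64_t late  = (int64_t)(block_dim*block_idx) + thread_idx;
        // operand widened first, multiply done in 64 bits: right
        const int64_t early = (int64_t)block_dim*block_idx + thread_idx;
        printf("late = %lld, early = %lld\n", (long long) late, (long long) early);
    }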
@@ -3,7 +3,7 @@
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

 template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);

 typedef to_t_cuda_t<float> to_fp32_cuda_t;
 typedef to_t_cuda_t<half>  to_fp16_cuda_t;
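This alias is the host-side dispatch type: each quantization format supplies one converter and callers pick it through a function pointer, which is why the signature change must ripple through every implementation at once. A self-contained sketch of the same pattern (the dummy converter and its name are illustrative, not from the codebase):

    #include <cstdint>
    #include <cstdio>

    // Same shape as to_t_cuda_t, minus the CUDA stream: a per-format
    // converter selected at runtime through a function pointer.
    typedef void (*to_fp32_t)(const void * x, float * y, int64_t k);

    static void convert_dummy(const void * x, float * y, int64_t k) {
        const int8_t * xi = (const int8_t *) x;
        for (int64_t i = 0; i < k; ++i) y[i] = (float) xi[i];
    }

    int main() {
        int8_t src[4] = {-2, -1, 0, 1};
        float  dst[4];
        to_fp32_t fn = convert_dummy;
        fn(src, dst, 4);
        printf("%.0f %.0f %.0f %.0f\n", dst[0], dst[1], dst[2], dst[3]);
    }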
@@ -1,6 +1,6 @@
 #include "common.cuh"

-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const block_q4_0 * x = (const block_q4_0 *) vx;

     const dfloat d = x[ib].d;

@@ -19,7 +19,7 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;

     const dfloat d = __low2half(x[ib].dm);

@@ -39,7 +39,7 @@ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const block_q5_0 * x = (const block_q5_0 *) vx;

     const dfloat d = x[ib].d;

@@ -62,7 +62,7 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;

     const dfloat d = __low2half(x[ib].dm);

@@ -86,7 +86,7 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const block_q8_0 * x = (const block_q8_0 *) vx;

     const dfloat d = x[ib].d;
@@ -565,7 +565,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
     }
 }

-static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const half * x = (const half *) vx;

     // automatic half -> float type cast if dfloat == float

@@ -577,7 +577,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;

     if (row >= nrows) {
         return;

@@ -598,7 +598,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
     for (int i = 0; i < ncols; i += iter_stride) {
         const int col = i + vals_per_iter*tid;
-        const int ib = (row*ncols + col)/qk; // x block index
+        const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
         const int iqs = (col%qk)/qr; // x quant index
         const int iybs = col - col%qk; // y block start index
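The pattern in dequantize_mul_mat_vec generalizes across these hunks: only indices proportional to the whole tensor (row, and ib = (row*ncols + col)/qk) need 64 bits, while indices bounded by a single block (iqs < qk) can stay int. Concretely, with hypothetical dimensions:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t row = 60000, ncols = 60000, col = 0, qk = 32;
        const int64_t ib  = (row*ncols + col)/qk; // product is 3.6e9, needs 64 bits
        const int     iqs = (int)((col % qk)/2);  // always < qk, safe as int
        printf("ib = %lld, iqs = %d\n", (long long) ib, iqs);
    }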
@@ -1,20 +1,20 @@
 #include "quantize.cuh"

-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
-    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
+    const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

     if (ix >= kx_padded) {
         return;
     }

-    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+    const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;

-    const int i_padded = iy*kx_padded + ix;
+    const int64_t i_padded = (int64_t)iy*kx_padded + ix;

     block_q8_1 * y = (block_q8_1 *) vy;

-    const int ib = i_padded / QK8_1; // block index
-    const int iqs = i_padded % QK8_1; // quant index
+    const int64_t ib = i_padded / QK8_1; // block index
+    const int64_t iqs = i_padded % QK8_1; // quant index

     const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
     float amax = fabsf(xi);

@@ -36,8 +36,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

-void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
-    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
+    const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
     const dim3 num_blocks(block_num_x, ky, 1);
     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
     quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
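quantize_q8_1 runs on a 2-D grid (ix across the padded row, iy across rows), and the padding tail is zero-filled by the `ix < kx ? x[...] : 0.0f` select. It is the flattened index i_padded = iy*kx_padded + ix that outgrows 32 bits first. Worked numbers for a hypothetical matrix:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t kx_padded = 32768, iy = 131072, ix = 0; // hypothetical sizes
        const int64_t i_padded  = iy*kx_padded + ix;          // 4,294,967,296 = 2^32
        printf("i_padded = %lld (wraps to %d as 32-bit)\n",
               (long long) i_padded, (int32_t)(uint32_t) i_padded);
    }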
@@ -2,4 +2,4 @@

 #define CUDA_QUANTIZE_BLOCK_SIZE 256

-void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream);
+void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
ggml-quants.c (310 changes): file diff suppressed because it is too large

ggml-quants.h (148 changes)
@@ -12,70 +12,70 @@ extern "C" {
 #endif

 // Quantization
-void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
-void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
-void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
-void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
-void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
-void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
+void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);

-void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
-void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
-void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
-void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
-void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
-void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
+void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);

-void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
-void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
-void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
-void quantize_row_iq3_s_reference  (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
-void quantize_row_iq2_s_reference  (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
+void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq3_s_reference  (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_s_reference  (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);

-void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

-void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

-void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

 // Dequantization
-void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

-void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

-void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq2_s  (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq1_s  (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq1_m  (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq3_s  (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq2_s  (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq1_s  (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq1_m  (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq3_s  (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
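All of these row functions count in elements, and judging from the kernels above, k is expected to be a multiple of the format's block size: 32 for the classic formats (as in `k / 32` earlier) and QK_K = 256 for the K- and IQ-formats. A tiny sanity-check sketch with those block sizes hardcoded as assumptions:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t k = 4096;                  // elements in one row
        const int64_t qk4_0 = 32, qk_k = 256;    // assumed block sizes
        printf("q4_0 blocks: %lld\n", (long long)(k/qk4_0));  // 128
        printf("K-quant blocks: %lld\n", (long long)(k/qk_k)); // 16
    }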
@@ -101,26 +101,26 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

-size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

 void iq2xs_init_impl(enum ggml_type type);
 void iq2xs_free_impl(enum ggml_type type);
@@ -15776,7 +15776,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #ifdef GGML_SYCL_FORCE_DMMV
     const bool use_mul_mat_vec_q = false;
 #else
-    bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
+    bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
     use_mul_mat_vec_q = use_mul_mat_vec_q ||
         (src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) ||
         (src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) ||

@@ -15787,7 +15787,6 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #endif // GGML_SYCL_FORCE_DMMV

     if (use_mul_mat_vec_q) {
-        // NOTE: this kernel does not support ggml_nrows(src1) > 1
         // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
         ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
     } else {
ggml.c (16 changes)

@@ -338,14 +338,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
     return GGML_FP32_TO_FP16(x);
 }

-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
-    for (int i = 0; i < n; i++) {
+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_FP16_TO_FP32(x[i]);
     }
 }

-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
-    int i = 0;
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
 #if defined(__F16C__)
     for (; i + 7 < n; i += 8) {
         __m256 x_vec = _mm256_loadu_ps(x + i);

@@ -20331,11 +20331,11 @@ size_t ggml_quantize_chunk(
         enum ggml_type   type,
         const float    * src,
         void           * dst,
-        int              start,
-        int              nrows,
-        int              n_per_row,
+        int64_t          start,
+        int64_t          nrows,
+        int64_t          n_per_row,
         const float    * imatrix) {
-    const int n = nrows * n_per_row;
+    const int64_t n = (int64_t) nrows * n_per_row;

     if (ggml_quantize_requires_imatrix(type)) {
         GGML_ASSERT(imatrix != NULL);
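With the parameters now int64_t, `nrows * n_per_row` is computed in 64 bits even without the explicit cast; the `(int64_t)` reads as defensive. The case the change guards against, in isolation (sizes hypothetical; the 32-bit product is routed through unsigned here only to make the wraparound observable without undefined behavior):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t nrows = 70000, n_per_row = 32768;
        const int32_t n_bad  = (int32_t)((uint32_t) nrows * (uint32_t) n_per_row);
        const int64_t n_good = (int64_t) nrows * n_per_row; // 2,293,760,000
        printf("bad = %d, good = %lld\n", n_bad, (long long) n_good);
    }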
ggml.h (14 changes)

@@ -332,8 +332,8 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);

     struct ggml_object;
     struct ggml_context;

@@ -2210,9 +2210,9 @@ extern "C" {
             enum ggml_type   type,
             const float    * src,
             void           * dst,
-            int              start,
-            int              nrows,
-            int              n_per_row,
+            int64_t          start,
+            int64_t          nrows,
+            int64_t          n_per_row,
             const float    * imatrix);

     //

@@ -2377,8 +2377,8 @@ extern "C" {
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                       const void * GGML_RESTRICT y, size_t by, int nrc);
@@ -641,6 +641,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
     ],
     MODEL_ARCH.DBRX: [
         MODEL_TENSOR.TOKEN_EMBD,

@@ -293,12 +293,14 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm", # persimmon
             "model.layers.{bid}.self_attn.q_norm",      # cohere
+            "transformer.blocks.{bid}.attn.q_ln",       # sea-lion
         ),

         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm", # persimmon
             "model.layers.{bid}.self_attn.k_norm",      # cohere
+            "transformer.blocks.{bid}.attn.k_ln",       # sea-lion
         ),
llama.h (89 changes)

@@ -37,10 +37,14 @@

 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 5

+#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
+#define LLAMA_STATE_SEQ_VERSION 1
+
 #ifdef __cplusplus
 extern "C" {
 #endif

@@ -523,6 +527,7 @@ extern "C" {
             struct llama_context * ctx);

     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0, p1]
     // p1 < 0     : [p0, inf)
@@ -594,34 +599,92 @@ extern "C" {

     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+        "use llama_state_get_size instead");

     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(
+    LLAMA_API size_t llama_state_get_data(
             struct llama_context * ctx,
                          uint8_t * dst);
+    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+            struct llama_context * ctx,
+                         uint8_t * dst),
+        "use llama_state_get_data instead");

     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(
+    LLAMA_API size_t llama_state_set_data(
             struct llama_context * ctx,
                    const uint8_t * src);
+    LLAMA_API DEPRECATED(size_t llama_set_state_data(
+            struct llama_context * ctx,
+                   const uint8_t * src),
+        "use llama_state_set_data instead");

     // Save/load session file
-    LLAMA_API bool llama_load_session_file(
+    LLAMA_API bool llama_state_load_file(
             struct llama_context * ctx,
                       const char * path_session,
                      llama_token * tokens_out,
                           size_t   n_token_capacity,
                           size_t * n_token_count_out);
+    LLAMA_API DEPRECATED(bool llama_load_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out),
+        "use llama_state_load_file instead");

-    LLAMA_API bool llama_save_session_file(
+    LLAMA_API bool llama_state_save_file(
             struct llama_context * ctx,
                       const char * path_session,
                const llama_token * tokens,
                           size_t   n_token_count);
+    LLAMA_API DEPRECATED(bool llama_save_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+               const llama_token * tokens,
+                          size_t   n_token_count),
+        "use llama_state_save_file instead");
+
+    // Get the exact size needed to copy the KV cache of a single sequence
+    LLAMA_API size_t llama_state_seq_get_size(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Copy the KV cache of a single sequence into the specified buffer
+    LLAMA_API size_t llama_state_seq_get_data(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                    llama_seq_id   seq_id);
+
+    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+    // Returns:
+    //  - Positive: Ok
+    //  - Zero: Failed to load
+    LLAMA_API size_t llama_state_seq_set_data(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                    llama_seq_id   dest_seq_id);
+
+    LLAMA_API size_t llama_state_seq_save_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   seq_id,
+               const llama_token * tokens,
+                          size_t   n_token_count);
+
+    LLAMA_API size_t llama_state_seq_load_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   dest_seq_id,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out);

     //
     // Decoding
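A hedged usage sketch of the new per-sequence state API, built only from the declarations and return-value comments above. This is a fragment, not a full program: ctx is assumed to be an initialized llama_context, the sequence ids are illustrative, and error handling is minimal:

    // copy sequence 0's KV cache into a buffer, then restore it as sequence 1
    const size_t n = llama_state_seq_get_size(ctx, 0);
    uint8_t * buf = (uint8_t *) malloc(n);
    llama_state_seq_get_data(ctx, buf, 0);
    if (llama_state_seq_set_data(ctx, buf, 1) == 0) { // zero means failed to load
        fprintf(stderr, "failed to restore sequence state\n");
    }
    free(buf);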
@@ -684,8 +747,9 @@ extern "C" {
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);

-    // Logits for the ith token. Equivalent to:
+    // Logits for the ith token. For positive indices, equivalent to:
     // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indices can be used to access logits in reverse order, -1 is the last logit.
     // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);

@@ -697,8 +761,9 @@ extern "C" {
     // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

-    // Get the embeddings for the ith token. Equivalent to:
+    // Get the embeddings for the ith token. For positive indices, equivalent to:
     // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
     // shape: [n_embd] (1-dimensional)
     // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
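The negative-index convention makes the common "last token" lookup a one-liner. Per the comments above, both of these are fragments assuming a context that was just decoded:

    const float * last_logits = llama_get_logits_ith(ctx, -1); // logits of the last output token
    const float * last_embd   = llama_get_embeddings_ith(ctx, -1);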
@@ -721,6 +786,8 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
+    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line

     // Returns -1 if unknown, 1 for true or 0 for false.
@@ -743,16 +810,16 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
-    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
-    ///                Does not insert a leading space.
+    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
+    ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                          int32_t   text_len,
                      llama_token * tokens,
                          int32_t   n_tokens_max,
-                            bool   add_bos,
-                            bool   special);
+                            bool   add_special,
+                            bool   parse_special);

     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
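The negative-on-failure return documented above supports a two-pass pattern: probe with an undersized buffer, allocate, then tokenize for real. A hedged fragment (model and text assumed initialized; the flag values are illustrative):

    // pass 1: probe - a negative return is minus the required token count
    int32_t n = llama_tokenize(model, text, text_len, NULL, 0,
                               /*add_special=*/true, /*parse_special=*/false);
    if (n < 0) {
        llama_token * toks = (llama_token *) malloc(-n * sizeof(llama_token));
        // pass 2: tokenize into the correctly sized buffer
        n = llama_tokenize(model, text, text_len, toks, -n, true, false);
    }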
scripts/gen-authors.sh (new executable file, 9 lines)

@@ -0,0 +1,9 @@
+#!/bin/bash
+
+printf "# date: $(date)\n" > AUTHORS
+printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS
+
+git log --format='%an <%ae>' --reverse --date=short master | awk '!seen[$0]++' | sort >> AUTHORS
+
+# if necessary, update your name here. for example: jdoe -> John Doe
+sed -i '' 's/^jdoe/John Doe/g' AUTHORS
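Note: `sed -i ''` on the last line is the BSD/macOS spelling of in-place editing; GNU sed on Linux takes `-i` with no separate empty-suffix argument, so that line may need a per-platform tweak when run outside macOS.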
@@ -66,6 +66,8 @@ while read c; do
             tests/test-quantize-fns.cpp \
             tests/test-quantize-perf.cpp \
             tests/test-backend-ops.cpp \
+            LICENSE \
+            scripts/gen-authors.sh \
             >> $SRC_LLAMA/ggml-src.patch
     done < $SRC_LLAMA/ggml-commits

@@ -123,6 +125,9 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # tests/test-quantize-fns.cpp  -> tests/test-quantize-fns.cpp
     # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
     # tests/test-backend-ops.cpp   -> tests/test-backend-ops.cpp
+    #
+    # LICENSE                      -> LICENSE
+    # scripts/gen-authors.sh       -> scripts/gen-authors.sh

     cat ggml-src.patch | sed \
         -e 's/src\/ggml\.c/ggml.c/g' \

@@ -156,6 +161,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \
         -e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \
         -e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \
+        -e 's/LICENSE/LICENSE/g' \
+        -e 's/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
         > ggml-src.patch.tmp
     mv ggml-src.patch.tmp ggml-src.patch
@@ -1 +1 @@
-bb8d8cff851b2de6fde4904be492d39458837e1a
+98875cdb7e9ceeb726d1c196d2fecb3cbb59b93a

@@ -31,3 +31,6 @@ cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
 cp -rpv ../ggml/tests/test-opt.cpp         ./tests/test-opt.cpp
 cp -rpv ../ggml/tests/test-grad0.cpp       ./tests/test-grad0.cpp
 cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp
+
+cp -rpv ../LICENSE                     ./LICENSE
+cp -rpv ../ggml/scripts/gen-authors.sh ./scripts/gen-authors.sh